[v2] examples/ipsec-secgw: add per core packet stats

Message ID 1587647245-10524-1-git-send-email-anoobj@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Headers
Series [v2] examples/ipsec-secgw: add per core packet stats |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-nxp-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/Intel-compilation success Compilation OK

Commit Message

Anoob Joseph April 23, 2020, 1:07 p.m. UTC
  Adding per core packet handling stats to analyze traffic distribution
when multiple cores are engaged.

Since aggregating the packet stats across cores would affect
performance, keeping the feature disabled using compile time flags.

Signed-off-by: Anoob Joseph <anoobj@marvell.com>
---

v2:
* Added lookup failure cases to drop count

 examples/ipsec-secgw/ipsec-secgw.c   | 118 +++++++++++++++++++++++++++++++++--
 examples/ipsec-secgw/ipsec-secgw.h   |   2 +
 examples/ipsec-secgw/ipsec.c         |  13 +++-
 examples/ipsec-secgw/ipsec.h         |  22 +++++++
 examples/ipsec-secgw/ipsec_process.c |   5 ++
 5 files changed, 154 insertions(+), 6 deletions(-)
  

Comments

Ananyev, Konstantin April 24, 2020, 11:14 a.m. UTC | #1
> Adding per core packet handling stats to analyze traffic distribution
> when multiple cores are engaged.
> 
> Since aggregating the packet stats across cores would affect
> performance, keeping the feature disabled using compile time flags.
> 
> Signed-off-by: Anoob Joseph <anoobj@marvell.com>
> ---
> 
> v2:
> * Added lookup failure cases to drop count
> 
>  examples/ipsec-secgw/ipsec-secgw.c   | 118 +++++++++++++++++++++++++++++++++--
>  examples/ipsec-secgw/ipsec-secgw.h   |   2 +
>  examples/ipsec-secgw/ipsec.c         |  13 +++-
>  examples/ipsec-secgw/ipsec.h         |  22 +++++++
>  examples/ipsec-secgw/ipsec_process.c |   5 ++
>  5 files changed, 154 insertions(+), 6 deletions(-)
> 
> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> index 6d02341..db92ddc 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.c
> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> @@ -288,6 +288,61 @@ adjust_ipv6_pktlen(struct rte_mbuf *m, const struct rte_ipv6_hdr *iph,
>  	}
>  }
> 
> +#ifdef ENABLE_STATS
> +static uint64_t timer_period = 10; /* default period is 10 seconds */

I think it is better to add user ability to control stats period.
Either runtime-option, or just compile time: 
replace ENABLE_STATS with STATS_PERIOD (0 would mean stats disabled).

> +
> +/* Print out statistics on packet distribution */
> +static void
> +print_stats(void)
> +{
> +	uint64_t total_packets_dropped, total_packets_tx, total_packets_rx;
> +	unsigned int coreid;
> +	float burst_percent;
> +
> +	total_packets_dropped = 0;
> +	total_packets_tx = 0;
> +	total_packets_rx = 0;
> +
> +	const char clr[] = { 27, '[', '2', 'J', '\0' };
> +	const char topLeft[] = { 27, '[', '1', ';', '1', 'H', '\0' };
> +
> +	/* Clear screen and move to top left */
> +	printf("%s%s", clr, topLeft);

Is that really needed?

> +
> +	printf("\nCore statistics ====================================");
> +
> +	for (coreid = 0; coreid < RTE_MAX_LCORE; coreid++) {
> +		/* skip disabled cores */
> +		if (rte_lcore_is_enabled(coreid) == 0)
> +			continue;
> +		burst_percent = (float)(core_statistics[coreid].burst_rx * 100)/
> +					core_statistics[coreid].rx;

Would float be always enough here? Might better long double?

> +		printf("\nStatistics for core %u ------------------------------"
> +			   "\nPackets received: %20"PRIu64
> +			   "\nPackets sent: %24"PRIu64
> +			   "\nPackets dropped: %21"PRIu64
> +			   "\nBurst percent: %23.2f",
> +			   coreid,
> +			   core_statistics[coreid].rx,
> +			   core_statistics[coreid].tx,
> +			   core_statistics[coreid].dropped,
> +			   burst_percent);
> +
> +		total_packets_dropped += core_statistics[coreid].dropped;
> +		total_packets_tx += core_statistics[coreid].tx;
> +		total_packets_rx += core_statistics[coreid].rx;
> +	}
> +	printf("\nAggregate statistics ==============================="
> +		   "\nTotal packets received: %14"PRIu64
> +		   "\nTotal packets sent: %18"PRIu64
> +		   "\nTotal packets dropped: %15"PRIu64,
> +		   total_packets_rx,
> +		   total_packets_tx,
> +		   total_packets_dropped);
> +	printf("\n====================================================\n");
> +}
> +#endif /* ENABLE_STATS */
> +
>  static inline void
>  prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)
>  {
> @@ -333,6 +388,7 @@ prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)
> 
>  		/* drop packet when IPv6 header exceeds first segment length */
>  		if (unlikely(l3len > pkt->data_len)) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(pkt);
>  			return;
>  		}
> @@ -350,6 +406,7 @@ prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)
>  		/* Unknown/Unsupported type, drop the packet */
>  		RTE_LOG(ERR, IPSEC, "Unsupported packet type 0x%x\n",
>  			rte_be_to_cpu_16(eth->ether_type));
> +		core_stats_update_drop(1);
>  		rte_pktmbuf_free(pkt);
>  		return;
>  	}
> @@ -471,6 +528,11 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
>  	int32_t ret;
>  	uint16_t queueid;
> 
> +#ifdef ENABLE_STATS
> +	int lcore_id = rte_lcore_id();
> +	core_statistics[lcore_id].tx += n;
> +#endif /* ENABLE_STATS */

Instead of polluting genric code with ifdefs, why not
to introduce 2 new functions: core_stats_update_rx(), core_stats_update_tx(),
as you did for core_stats_drop()?

> +
>  	queueid = qconf->tx_queue_id[port];
>  	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
> 
> @@ -478,6 +540,9 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
> 
>  	ret = rte_eth_tx_burst(port, queueid, m_table, n);
>  	if (unlikely(ret < n)) {
> +#ifdef ENABLE_STATS
> +		core_statistics[lcore_id].dropped += n-ret;
> +#endif /* ENABLE_STATS */

You have core_stats_update_drop() for that - use it.

>  		do {
>  			rte_pktmbuf_free(m_table[ret]);
>  		} while (++ret < n);
> @@ -525,6 +590,7 @@ send_fragment_packet(struct lcore_conf *qconf, struct rte_mbuf *m,
>  			"error code: %d\n",
>  			__func__, m->pkt_len, rte_errno);
> 
> +	core_stats_update_drop(1);
>  	rte_pktmbuf_free(m);
>  	return len;
>  }
> @@ -549,8 +615,10 @@ send_single_packet(struct rte_mbuf *m, uint16_t port, uint8_t proto)
>  	/* need to fragment the packet */
>  	} else if (frag_tbl_sz > 0)
>  		len = send_fragment_packet(qconf, m, port, proto);
> -	else
> +	else {
> +		core_stats_update_drop(1);
>  		rte_pktmbuf_free(m);

It looks like a lot of such places...
Would it be worth to unite core_stats_update_drop() and rte_pktmbuf_free(m)
Into some inline function: ipsec_secgw_packet_drop(struct rte_mbuf *m[], uint32_t n) 
and use it all over such places. 

> +	}
> 
>  	/* enough pkts to be sent */
>  	if (unlikely(len == MAX_PKT_BURST)) {
> @@ -584,18 +652,21 @@ inbound_sp_sa(struct sp_ctx *sp, struct sa_ctx *sa, struct traffic_type *ip,
>  			continue;
>  		}
>  		if (res == DISCARD) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(m);
>  			continue;
>  		}
> 
>  		/* Only check SPI match for processed IPSec packets */
>  		if (i < lim && ((m->ol_flags & PKT_RX_SEC_OFFLOAD) == 0)) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(m);
>  			continue;
>  		}
> 
>  		sa_idx = res - 1;
>  		if (!inbound_sa_check(sa, m, sa_idx)) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(m);
>  			continue;
>  		}
> @@ -630,8 +701,10 @@ split46_traffic(struct ipsec_traffic *trf, struct rte_mbuf *mb[], uint32_t num)
>  					uint8_t *,
>  					offsetof(struct ip6_hdr, ip6_nxt));
>  			n6++;
> -		} else
> +		} else {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(m);
> +		}
>  	}
> 
>  	trf->ip4.num = n4;
> @@ -682,11 +755,12 @@ outbound_sp(struct sp_ctx *sp, struct traffic_type *ip,
>  	for (i = 0; i < ip->num; i++) {
>  		m = ip->pkts[i];
>  		sa_idx = ip->res[i] - 1;
> -		if (ip->res[i] == DISCARD)
> +		if (ip->res[i] == DISCARD) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(m);
> -		else if (ip->res[i] == BYPASS)
> +		} else if (ip->res[i] == BYPASS) {

Looks unnecessary.

>  			ip->pkts[j++] = m;
> -		else {
> +		} else {
>  			ipsec->res[ipsec->num] = sa_idx;
>  			ipsec->pkts[ipsec->num++] = m;
>  		}
> @@ -705,6 +779,8 @@ process_pkts_outbound(struct ipsec_ctx *ipsec_ctx,
>  	for (i = 0; i < traffic->ipsec.num; i++)
>  		rte_pktmbuf_free(traffic->ipsec.pkts[i]);
> 
> +	core_stats_update_drop(traffic->ipsec.num);
> +
>  	traffic->ipsec.num = 0;
> 
>  	outbound_sp(ipsec_ctx->sp4_ctx, &traffic->ip4, &traffic->ipsec);
> @@ -745,12 +821,14 @@ process_pkts_inbound_nosp(struct ipsec_ctx *ipsec_ctx,
>  	/* Drop any IPv4 traffic from unprotected ports */
>  	for (i = 0; i < traffic->ip4.num; i++)
>  		rte_pktmbuf_free(traffic->ip4.pkts[i]);
> +	core_stats_update_drop(traffic->ip4.num);
> 
>  	traffic->ip4.num = 0;
> 
>  	/* Drop any IPv6 traffic from unprotected ports */
>  	for (i = 0; i < traffic->ip6.num; i++)
>  		rte_pktmbuf_free(traffic->ip6.pkts[i]);
> +	core_stats_update_drop(traffic->ip6.num);
> 
>  	traffic->ip6.num = 0;
> 
> @@ -788,6 +866,7 @@ process_pkts_outbound_nosp(struct ipsec_ctx *ipsec_ctx,
>  	/* Drop any IPsec traffic from protected ports */
>  	for (i = 0; i < traffic->ipsec.num; i++)
>  		rte_pktmbuf_free(traffic->ipsec.pkts[i]);
> +	core_stats_update_drop(traffic->ipsec.num);
> 
>  	n = 0;
> 
> @@ -901,6 +980,7 @@ route4_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf *pkts[], uint8_t nb_pkts)
>  		}
> 
>  		if ((pkt_hop & RTE_LPM_LOOKUP_SUCCESS) == 0) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(pkts[i]);
>  			continue;
>  		}
> @@ -953,6 +1033,7 @@ route6_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf *pkts[], uint8_t nb_pkts)
>  		}
> 
>  		if (pkt_hop == -1) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(pkts[i]);
>  			continue;
>  		}
> @@ -1099,6 +1180,9 @@ ipsec_poll_mode_worker(void)
>  	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
>  			/ US_PER_S * BURST_TX_DRAIN_US;
>  	struct lcore_rx_queue *rxql;
> +#ifdef ENABLE_STATS
> +	uint64_t timer_tsc = 0;
> +#endif /* ENABLE_STATS */

Probably better just RTE_SET_USED(timer_tsc);

> 
>  	prev_tsc = 0;
>  	lcore_id = rte_lcore_id();
> @@ -1159,6 +1243,19 @@ ipsec_poll_mode_worker(void)
>  			drain_tx_buffers(qconf);
>  			drain_crypto_buffers(qconf);
>  			prev_tsc = cur_tsc;
> +#ifdef ENABLE_STATS
> +			if (lcore_id == rte_get_master_lcore()) {
> +				/* advance the timer */
> +				timer_tsc += diff_tsc;
> +
> +				/* if timer has reached its timeout */
> +				if (unlikely(timer_tsc >= timer_period)) {
> +					print_stats();
> +					/* reset the timer */
> +					timer_tsc = 0;
> +				}
> +			}
> +#endif /* ENABLE_STATS */


Why to do stats collection/display inside data-path?
Why not use rte_timer/rte_alarm and make it happen in control thread?
Another option - make stats printing at some signal to the process.
In that case we don't need to bother with time period at all - it will be
user to decide.
Again if we remove that print_stats() from data-path it might become 
cheap enough to always collect it, and we will not need ENABLE_STATS
macro at all.

>  		}
> 
>  		for (i = 0; i < qconf->nb_rx_queue; ++i) {
> @@ -1169,6 +1266,12 @@ ipsec_poll_mode_worker(void)
>  			nb_rx = rte_eth_rx_burst(portid, queueid,
>  					pkts, MAX_PKT_BURST);
> 
> +#ifdef ENABLE_STATS
> +			core_statistics[lcore_id].rx += nb_rx;
> +			if (nb_rx == MAX_PKT_BURST)
> +				core_statistics[lcore_id].burst_rx += nb_rx;
> +#endif /* ENABLE_STATS */
> +

Same for above for TX: no need to pollute the code with ifdefs.
Better to introduce new function: stats_update_rx() or so.


>  			if (nb_rx > 0)
>  				process_pkts(qconf, pkts, nb_rx, portid);
> 
> @@ -2747,6 +2850,11 @@ main(int32_t argc, char **argv)
>  	signal(SIGINT, signal_handler);
>  	signal(SIGTERM, signal_handler);
> 
> +#ifdef ENABLE_STATS
> +	/* convert to number of cycles */
> +	timer_period *= rte_get_timer_hz();
> +#endif /* ENABLE_STATS */
> +
>  	/* initialize event helper configuration */
>  	eh_conf = eh_conf_init();
>  	if (eh_conf == NULL)
> diff --git a/examples/ipsec-secgw/ipsec-secgw.h b/examples/ipsec-secgw/ipsec-secgw.h
> index 4b53cb5..d886a35 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.h
> +++ b/examples/ipsec-secgw/ipsec-secgw.h
> @@ -6,6 +6,8 @@
> 
>  #include <stdbool.h>
> 
> +//#define ENABLE_STATS
> +

Should be removed I think.

>  #define NB_SOCKETS 4
> 
>  #define MAX_PKT_BURST 32
> diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
> index bf88d80..dcb9312 100644
> --- a/examples/ipsec-secgw/ipsec.c
> +++ b/examples/ipsec-secgw/ipsec.c
> @@ -499,8 +499,10 @@ enqueue_cop_burst(struct cdev_qp *cqp)
>  			" enqueued %u crypto ops out of %u\n",
>  			cqp->id, cqp->qp, ret, len);
>  			/* drop packets that we fail to enqueue */
> -			for (i = ret; i < len; i++)
> +			for (i = ret; i < len; i++) {
> +				core_stats_update_drop(1);
>  				rte_pktmbuf_free(cqp->buf[i]->sym->m_src);
> +			}
>  	}
>  	cqp->in_flight += ret;
>  	cqp->len = 0;
> @@ -528,6 +530,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
> 
>  	for (i = 0; i < nb_pkts; i++) {
>  		if (unlikely(sas[i] == NULL)) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(pkts[i]);
>  			continue;
>  		}
> @@ -549,6 +552,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
> 
>  			if ((unlikely(ips->security.ses == NULL)) &&
>  				create_lookaside_session(ipsec_ctx, sa, ips)) {
> +				core_stats_update_drop(1);
>  				rte_pktmbuf_free(pkts[i]);
>  				continue;
>  			}
> @@ -563,6 +567,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
>  		case RTE_SECURITY_ACTION_TYPE_CPU_CRYPTO:
>  			RTE_LOG(ERR, IPSEC, "CPU crypto is not supported by the"
>  					" legacy mode.");
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(pkts[i]);
>  			continue;
> 
> @@ -575,6 +580,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
> 
>  			if ((unlikely(ips->crypto.ses == NULL)) &&
>  				create_lookaside_session(ipsec_ctx, sa, ips)) {
> +				core_stats_update_drop(1);
>  				rte_pktmbuf_free(pkts[i]);
>  				continue;
>  			}
> @@ -584,6 +590,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
> 
>  			ret = xform_func(pkts[i], sa, &priv->cop);
>  			if (unlikely(ret)) {
> +				core_stats_update_drop(1);
>  				rte_pktmbuf_free(pkts[i]);
>  				continue;
>  			}
> @@ -608,6 +615,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
> 
>  			ret = xform_func(pkts[i], sa, &priv->cop);
>  			if (unlikely(ret)) {
> +				core_stats_update_drop(1);
>  				rte_pktmbuf_free(pkts[i]);
>  				continue;
>  			}
> @@ -643,6 +651,7 @@ ipsec_inline_dequeue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
>  		sa = priv->sa;
>  		ret = xform_func(pkt, sa, &priv->cop);
>  		if (unlikely(ret)) {
> +			core_stats_update_drop(1);
>  			rte_pktmbuf_free(pkt);
>  			continue;
>  		}
> @@ -690,12 +699,14 @@ ipsec_dequeue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
>  				RTE_SECURITY_ACTION_TYPE_NONE) {
>  				ret = xform_func(pkt, sa, cops[j]);
>  				if (unlikely(ret)) {
> +					core_stats_update_drop(1);
>  					rte_pktmbuf_free(pkt);
>  					continue;
>  				}
>  			} else if (ipsec_get_action_type(sa) ==
>  				RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL) {
>  				if (cops[j]->status) {
> +					core_stats_update_drop(1);
>  					rte_pktmbuf_free(pkt);
>  					continue;
>  				}
> diff --git a/examples/ipsec-secgw/ipsec.h b/examples/ipsec-secgw/ipsec.h
> index 1e642d1..8519eab 100644
> --- a/examples/ipsec-secgw/ipsec.h
> +++ b/examples/ipsec-secgw/ipsec.h
> @@ -46,6 +46,17 @@
> 
>  #define IP6_VERSION (6)
> 
> +#ifdef ENABLE_STATS
> +struct ipsec_core_statistics {
> +	uint64_t tx;
> +	uint64_t rx;
> +	uint64_t dropped;
> +	uint64_t burst_rx;

A bit strange to have burst_rx and no similar counterpart for tx.
BTW, do you need burst_rx?
Might be better:
nb_calls_rx, nb_calls_tx;
and then: rx/nb_calls_rx will give you average burst size.

> +} __rte_cache_aligned;
> +
> +struct ipsec_core_statistics core_statistics[RTE_MAX_ETHPORTS];

Should be RTE_MAX_LCORES, I think.

> +#endif /* ENABLE_STATS */
> +
>  struct rte_crypto_xform;
>  struct ipsec_xform;
>  struct rte_mbuf;
> @@ -416,4 +427,15 @@ check_flow_params(uint16_t fdir_portid, uint8_t fdir_qid);
>  int
>  create_ipsec_esp_flow(struct ipsec_sa *sa);
> 
> +static inline void
> +core_stats_update_drop(int n)
> +{
> +#ifdef ENABLE_STATS
> +	int lcore_id = rte_lcore_id();
> +	core_statistics[lcore_id].dropped += n;
> +#else
> +	RTE_SET_USED(n);
> +#endif /* ENABLE_STATS */
> +}
> +
>  #endif /* __IPSEC_H__ */
> diff --git a/examples/ipsec-secgw/ipsec_process.c b/examples/ipsec-secgw/ipsec_process.c
> index bb2f2b8..05cb3ad 100644
> --- a/examples/ipsec-secgw/ipsec_process.c
> +++ b/examples/ipsec-secgw/ipsec_process.c
> @@ -24,6 +24,11 @@ free_pkts(struct rte_mbuf *mb[], uint32_t n)
>  {
>  	uint32_t i;
> 
> +#ifdef ENABLE_STATS
> +	int lcore_id = rte_lcore_id();
> +	core_statistics[lcore_id].dropped += n;
> +#endif /* ENABLE_STATS */
> +

Same as above - why not use stats_update_drop() here?

>  	for (i = 0; i != n; i++)
>  		rte_pktmbuf_free(mb[i]);
>  }
> --
> 2.7.4
  
Anoob Joseph May 2, 2020, 6:43 p.m. UTC | #2
Hi Konstantin,

Thanks for the review. Please see inline.

@Akhil, do you have any comments?

Thanks,
Anoob

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Friday, April 24, 2020 4:44 PM
> To: Anoob Joseph <anoobj@marvell.com>; Akhil Goyal <akhil.goyal@nxp.com>;
> Nicolau, Radu <radu.nicolau@intel.com>
> Cc: Narayana Prasad Raju Athreya <pathreya@marvell.com>; dev@dpdk.org
> Subject: [EXT] RE: [dpdk-dev] [PATCH v2] examples/ipsec-secgw: add per core
> packet stats
> 
> External Email
> 
> ----------------------------------------------------------------------
> 
> > Adding per core packet handling stats to analyze traffic distribution
> > when multiple cores are engaged.
> >
> > Since aggregating the packet stats across cores would affect
> > performance, keeping the feature disabled using compile time flags.
> >
> > Signed-off-by: Anoob Joseph <anoobj@marvell.com>
> > ---
> >
> > v2:
> > * Added lookup failure cases to drop count
> >
> >  examples/ipsec-secgw/ipsec-secgw.c   | 118
> +++++++++++++++++++++++++++++++++--
> >  examples/ipsec-secgw/ipsec-secgw.h   |   2 +
> >  examples/ipsec-secgw/ipsec.c         |  13 +++-
> >  examples/ipsec-secgw/ipsec.h         |  22 +++++++
> >  examples/ipsec-secgw/ipsec_process.c |   5 ++
> >  5 files changed, 154 insertions(+), 6 deletions(-)
> >
> > diff --git a/examples/ipsec-secgw/ipsec-secgw.c
> > b/examples/ipsec-secgw/ipsec-secgw.c
> > index 6d02341..db92ddc 100644
> > --- a/examples/ipsec-secgw/ipsec-secgw.c
> > +++ b/examples/ipsec-secgw/ipsec-secgw.c
> > @@ -288,6 +288,61 @@ adjust_ipv6_pktlen(struct rte_mbuf *m, const struct
> rte_ipv6_hdr *iph,
> >  	}
> >  }
> >
> > +#ifdef ENABLE_STATS
> > +static uint64_t timer_period = 10; /* default period is 10 seconds */
> 
> I think it is better to add user ability to control stats period.
> Either runtime-option, or just compile time:
> replace ENABLE_STATS with STATS_PERIOD (0 would mean stats disabled).

[Anoob] That's a good suggestion. Will make this change as part of v3.
 
> 
> > +
> > +/* Print out statistics on packet distribution */ static void
> > +print_stats(void)
> > +{
> > +	uint64_t total_packets_dropped, total_packets_tx, total_packets_rx;
> > +	unsigned int coreid;
> > +	float burst_percent;
> > +
> > +	total_packets_dropped = 0;
> > +	total_packets_tx = 0;
> > +	total_packets_rx = 0;
> > +
> > +	const char clr[] = { 27, '[', '2', 'J', '\0' };
> > +	const char topLeft[] = { 27, '[', '1', ';', '1', 'H', '\0' };
> > +
> > +	/* Clear screen and move to top left */
> > +	printf("%s%s", clr, topLeft);
> 
> Is that really needed?

[Anoob] I had just copied from l2fwd and worked from that. But it seems like, 'topleft' is not needed. Will remove that. Without a clearscreen, the prints would just crowd the screen. I would like to retain clearscreen.
 
> 
> > +
> > +	printf("\nCore statistics ====================================");
> > +
> > +	for (coreid = 0; coreid < RTE_MAX_LCORE; coreid++) {
> > +		/* skip disabled cores */
> > +		if (rte_lcore_is_enabled(coreid) == 0)
> > +			continue;
> > +		burst_percent = (float)(core_statistics[coreid].burst_rx * 100)/
> > +					core_statistics[coreid].rx;
> 
> Would float be always enough here? Might better long double?

[Anoob] This is just to get the percentage of traffic doing bursts. Since it's percentage float seemed enough. Do you think we need double instead? I can change if you think so.
 
> 
> > +		printf("\nStatistics for core %u ------------------------------"
> > +			   "\nPackets received: %20"PRIu64
> > +			   "\nPackets sent: %24"PRIu64
> > +			   "\nPackets dropped: %21"PRIu64
> > +			   "\nBurst percent: %23.2f",
> > +			   coreid,
> > +			   core_statistics[coreid].rx,
> > +			   core_statistics[coreid].tx,
> > +			   core_statistics[coreid].dropped,
> > +			   burst_percent);
> > +
> > +		total_packets_dropped += core_statistics[coreid].dropped;
> > +		total_packets_tx += core_statistics[coreid].tx;
> > +		total_packets_rx += core_statistics[coreid].rx;
> > +	}
> > +	printf("\nAggregate statistics ==============================="
> > +		   "\nTotal packets received: %14"PRIu64
> > +		   "\nTotal packets sent: %18"PRIu64
> > +		   "\nTotal packets dropped: %15"PRIu64,
> > +		   total_packets_rx,
> > +		   total_packets_tx,
> > +		   total_packets_dropped);
> > +
> 	printf("\n===================================================
> =\n");
> > +}
> > +#endif /* ENABLE_STATS */
> > +
> >  static inline void
> >  prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)  {
> > @@ -333,6 +388,7 @@ prepare_one_packet(struct rte_mbuf *pkt, struct
> > ipsec_traffic *t)
> >
> >  		/* drop packet when IPv6 header exceeds first segment length
> */
> >  		if (unlikely(l3len > pkt->data_len)) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(pkt);
> >  			return;
> >  		}
> > @@ -350,6 +406,7 @@ prepare_one_packet(struct rte_mbuf *pkt, struct
> ipsec_traffic *t)
> >  		/* Unknown/Unsupported type, drop the packet */
> >  		RTE_LOG(ERR, IPSEC, "Unsupported packet type 0x%x\n",
> >  			rte_be_to_cpu_16(eth->ether_type));
> > +		core_stats_update_drop(1);
> >  		rte_pktmbuf_free(pkt);
> >  		return;
> >  	}
> > @@ -471,6 +528,11 @@ send_burst(struct lcore_conf *qconf, uint16_t n,
> uint16_t port)
> >  	int32_t ret;
> >  	uint16_t queueid;
> >
> > +#ifdef ENABLE_STATS
> > +	int lcore_id = rte_lcore_id();
> > +	core_statistics[lcore_id].tx += n;
> > +#endif /* ENABLE_STATS */
> 
> Instead of polluting genric code with ifdefs, why not to introduce 2 new
> functions: core_stats_update_rx(), core_stats_update_tx(), as you did for
> core_stats_drop()?

[Anoob] The drop would happen from multiple places while Rx & Tx would happen only from one place each. I wasn't sure which would be the right approach. If you agree to introduce rx, tx & drop stats functions, I will update so.

> 
> > +
> >  	queueid = qconf->tx_queue_id[port];
> >  	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
> >
> > @@ -478,6 +540,9 @@ send_burst(struct lcore_conf *qconf, uint16_t n,
> > uint16_t port)
> >
> >  	ret = rte_eth_tx_burst(port, queueid, m_table, n);
> >  	if (unlikely(ret < n)) {
> > +#ifdef ENABLE_STATS
> > +		core_statistics[lcore_id].dropped += n-ret; #endif /*
> ENABLE_STATS
> > +*/
> 
> You have core_stats_update_drop() for that - use it.

[Anoob] lcore_id is already available as local variable. Will add the call instead. 
 
> 
> >  		do {
> >  			rte_pktmbuf_free(m_table[ret]);
> >  		} while (++ret < n);
> > @@ -525,6 +590,7 @@ send_fragment_packet(struct lcore_conf *qconf,
> struct rte_mbuf *m,
> >  			"error code: %d\n",
> >  			__func__, m->pkt_len, rte_errno);
> >
> > +	core_stats_update_drop(1);
> >  	rte_pktmbuf_free(m);
> >  	return len;
> >  }
> > @@ -549,8 +615,10 @@ send_single_packet(struct rte_mbuf *m, uint16_t
> port, uint8_t proto)
> >  	/* need to fragment the packet */
> >  	} else if (frag_tbl_sz > 0)
> >  		len = send_fragment_packet(qconf, m, port, proto);
> > -	else
> > +	else {
> > +		core_stats_update_drop(1);
> >  		rte_pktmbuf_free(m);
> 
> It looks like a lot of such places...
> Would it be worth to unite core_stats_update_drop() and rte_pktmbuf_free(m)
> Into some inline function: ipsec_secgw_packet_drop(struct rte_mbuf *m[],
> uint32_t n) and use it all over such places.

[Anoob] Ack
 
> 
> > +	}
> >
> >  	/* enough pkts to be sent */
> >  	if (unlikely(len == MAX_PKT_BURST)) { @@ -584,18 +652,21 @@
> > inbound_sp_sa(struct sp_ctx *sp, struct sa_ctx *sa, struct traffic_type *ip,
> >  			continue;
> >  		}
> >  		if (res == DISCARD) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(m);
> >  			continue;
> >  		}
> >
> >  		/* Only check SPI match for processed IPSec packets */
> >  		if (i < lim && ((m->ol_flags & PKT_RX_SEC_OFFLOAD) == 0)) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(m);
> >  			continue;
> >  		}
> >
> >  		sa_idx = res - 1;
> >  		if (!inbound_sa_check(sa, m, sa_idx)) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(m);
> >  			continue;
> >  		}
> > @@ -630,8 +701,10 @@ split46_traffic(struct ipsec_traffic *trf, struct
> rte_mbuf *mb[], uint32_t num)
> >  					uint8_t *,
> >  					offsetof(struct ip6_hdr, ip6_nxt));
> >  			n6++;
> > -		} else
> > +		} else {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(m);
> > +		}
> >  	}
> >
> >  	trf->ip4.num = n4;
> > @@ -682,11 +755,12 @@ outbound_sp(struct sp_ctx *sp, struct traffic_type
> *ip,
> >  	for (i = 0; i < ip->num; i++) {
> >  		m = ip->pkts[i];
> >  		sa_idx = ip->res[i] - 1;
> > -		if (ip->res[i] == DISCARD)
> > +		if (ip->res[i] == DISCARD) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(m);
> > -		else if (ip->res[i] == BYPASS)
> > +		} else if (ip->res[i] == BYPASS) {
> 
> Looks unnecessary.
> 
> >  			ip->pkts[j++] = m;
> > -		else {
> > +		} else {
> >  			ipsec->res[ipsec->num] = sa_idx;
> >  			ipsec->pkts[ipsec->num++] = m;
> >  		}
> > @@ -705,6 +779,8 @@ process_pkts_outbound(struct ipsec_ctx *ipsec_ctx,
> >  	for (i = 0; i < traffic->ipsec.num; i++)
> >  		rte_pktmbuf_free(traffic->ipsec.pkts[i]);
> >
> > +	core_stats_update_drop(traffic->ipsec.num);
> > +
> >  	traffic->ipsec.num = 0;
> >
> >  	outbound_sp(ipsec_ctx->sp4_ctx, &traffic->ip4, &traffic->ipsec); @@
> > -745,12 +821,14 @@ process_pkts_inbound_nosp(struct ipsec_ctx *ipsec_ctx,
> >  	/* Drop any IPv4 traffic from unprotected ports */
> >  	for (i = 0; i < traffic->ip4.num; i++)
> >  		rte_pktmbuf_free(traffic->ip4.pkts[i]);
> > +	core_stats_update_drop(traffic->ip4.num);
> >
> >  	traffic->ip4.num = 0;
> >
> >  	/* Drop any IPv6 traffic from unprotected ports */
> >  	for (i = 0; i < traffic->ip6.num; i++)
> >  		rte_pktmbuf_free(traffic->ip6.pkts[i]);
> > +	core_stats_update_drop(traffic->ip6.num);
> >
> >  	traffic->ip6.num = 0;
> >
> > @@ -788,6 +866,7 @@ process_pkts_outbound_nosp(struct ipsec_ctx
> *ipsec_ctx,
> >  	/* Drop any IPsec traffic from protected ports */
> >  	for (i = 0; i < traffic->ipsec.num; i++)
> >  		rte_pktmbuf_free(traffic->ipsec.pkts[i]);
> > +	core_stats_update_drop(traffic->ipsec.num);
> >
> >  	n = 0;
> >
> > @@ -901,6 +980,7 @@ route4_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf
> *pkts[], uint8_t nb_pkts)
> >  		}
> >
> >  		if ((pkt_hop & RTE_LPM_LOOKUP_SUCCESS) == 0) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(pkts[i]);
> >  			continue;
> >  		}
> > @@ -953,6 +1033,7 @@ route6_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf
> *pkts[], uint8_t nb_pkts)
> >  		}
> >
> >  		if (pkt_hop == -1) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(pkts[i]);
> >  			continue;
> >  		}
> > @@ -1099,6 +1180,9 @@ ipsec_poll_mode_worker(void)
> >  	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
> >  			/ US_PER_S * BURST_TX_DRAIN_US;
> >  	struct lcore_rx_queue *rxql;
> > +#ifdef ENABLE_STATS
> > +	uint64_t timer_tsc = 0;
> > +#endif /* ENABLE_STATS */
> 
> Probably better just RTE_SET_USED(timer_tsc);

[Anoob] Isn't it better to keep all the code related an optional feature under on flag? Either way is fine.
 
> 
> >
> >  	prev_tsc = 0;
> >  	lcore_id = rte_lcore_id();
> > @@ -1159,6 +1243,19 @@ ipsec_poll_mode_worker(void)
> >  			drain_tx_buffers(qconf);
> >  			drain_crypto_buffers(qconf);
> >  			prev_tsc = cur_tsc;
> > +#ifdef ENABLE_STATS
> > +			if (lcore_id == rte_get_master_lcore()) {
> > +				/* advance the timer */
> > +				timer_tsc += diff_tsc;
> > +
> > +				/* if timer has reached its timeout */
> > +				if (unlikely(timer_tsc >= timer_period)) {
> > +					print_stats();
> > +					/* reset the timer */
> > +					timer_tsc = 0;
> > +				}
> > +			}
> > +#endif /* ENABLE_STATS */
> 
> 
> Why to do stats collection/display inside data-path?
> Why not use rte_timer/rte_alarm and make it happen in control thread?
> Another option - make stats printing at some signal to the process.
> In that case we don't need to bother with time period at all - it will be user to
> decide.
> Again if we remove that print_stats() from data-path it might become cheap
> enough to always collect it, and we will not need ENABLE_STATS macro at all.

[Anoob] The stats collection will not be cheap because we are updating a common structure from multiple cores. And the idea is not to have a very efficient stats mechanism. When we have more cores engaged, IPsec performance is directly dependent on ability of RSS to split traffic to multiple cores. Such stats would come handy in understanding per core distribution of incoming traffic. 
 
> 
> >  		}
> >
> >  		for (i = 0; i < qconf->nb_rx_queue; ++i) { @@ -1169,6 +1266,12
> @@
> > ipsec_poll_mode_worker(void)
> >  			nb_rx = rte_eth_rx_burst(portid, queueid,
> >  					pkts, MAX_PKT_BURST);
> >
> > +#ifdef ENABLE_STATS
> > +			core_statistics[lcore_id].rx += nb_rx;
> > +			if (nb_rx == MAX_PKT_BURST)
> > +				core_statistics[lcore_id].burst_rx += nb_rx;
> #endif /*
> > +ENABLE_STATS */
> > +
> 
> Same for above for TX: no need to pollute the code with ifdefs.
> Better to introduce new function: stats_update_rx() or so.

[Anoob] Ack

> 
> 
> >  			if (nb_rx > 0)
> >  				process_pkts(qconf, pkts, nb_rx, portid);
> >
> > @@ -2747,6 +2850,11 @@ main(int32_t argc, char **argv)
> >  	signal(SIGINT, signal_handler);
> >  	signal(SIGTERM, signal_handler);
> >
> > +#ifdef ENABLE_STATS
> > +	/* convert to number of cycles */
> > +	timer_period *= rte_get_timer_hz();
> > +#endif /* ENABLE_STATS */
> > +
> >  	/* initialize event helper configuration */
> >  	eh_conf = eh_conf_init();
> >  	if (eh_conf == NULL)
> > diff --git a/examples/ipsec-secgw/ipsec-secgw.h
> > b/examples/ipsec-secgw/ipsec-secgw.h
> > index 4b53cb5..d886a35 100644
> > --- a/examples/ipsec-secgw/ipsec-secgw.h
> > +++ b/examples/ipsec-secgw/ipsec-secgw.h
> > @@ -6,6 +6,8 @@
> >
> >  #include <stdbool.h>
> >
> > +//#define ENABLE_STATS
> > +
> 
> Should be removed I think.
> 
> >  #define NB_SOCKETS 4
> >
> >  #define MAX_PKT_BURST 32
> > diff --git a/examples/ipsec-secgw/ipsec.c
> > b/examples/ipsec-secgw/ipsec.c index bf88d80..dcb9312 100644
> > --- a/examples/ipsec-secgw/ipsec.c
> > +++ b/examples/ipsec-secgw/ipsec.c
> > @@ -499,8 +499,10 @@ enqueue_cop_burst(struct cdev_qp *cqp)
> >  			" enqueued %u crypto ops out of %u\n",
> >  			cqp->id, cqp->qp, ret, len);
> >  			/* drop packets that we fail to enqueue */
> > -			for (i = ret; i < len; i++)
> > +			for (i = ret; i < len; i++) {
> > +				core_stats_update_drop(1);
> >  				rte_pktmbuf_free(cqp->buf[i]->sym->m_src);
> > +			}
> >  	}
> >  	cqp->in_flight += ret;
> >  	cqp->len = 0;
> > @@ -528,6 +530,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct
> > ipsec_ctx *ipsec_ctx,
> >
> >  	for (i = 0; i < nb_pkts; i++) {
> >  		if (unlikely(sas[i] == NULL)) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(pkts[i]);
> >  			continue;
> >  		}
> > @@ -549,6 +552,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct
> > ipsec_ctx *ipsec_ctx,
> >
> >  			if ((unlikely(ips->security.ses == NULL)) &&
> >  				create_lookaside_session(ipsec_ctx, sa, ips)) {
> > +				core_stats_update_drop(1);
> >  				rte_pktmbuf_free(pkts[i]);
> >  				continue;
> >  			}
> > @@ -563,6 +567,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct
> ipsec_ctx *ipsec_ctx,
> >  		case RTE_SECURITY_ACTION_TYPE_CPU_CRYPTO:
> >  			RTE_LOG(ERR, IPSEC, "CPU crypto is not supported by
> the"
> >  					" legacy mode.");
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(pkts[i]);
> >  			continue;
> >
> > @@ -575,6 +580,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct
> > ipsec_ctx *ipsec_ctx,
> >
> >  			if ((unlikely(ips->crypto.ses == NULL)) &&
> >  				create_lookaside_session(ipsec_ctx, sa, ips)) {
> > +				core_stats_update_drop(1);
> >  				rte_pktmbuf_free(pkts[i]);
> >  				continue;
> >  			}
> > @@ -584,6 +590,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct
> > ipsec_ctx *ipsec_ctx,
> >
> >  			ret = xform_func(pkts[i], sa, &priv->cop);
> >  			if (unlikely(ret)) {
> > +				core_stats_update_drop(1);
> >  				rte_pktmbuf_free(pkts[i]);
> >  				continue;
> >  			}
> > @@ -608,6 +615,7 @@ ipsec_enqueue(ipsec_xform_fn xform_func, struct
> > ipsec_ctx *ipsec_ctx,
> >
> >  			ret = xform_func(pkts[i], sa, &priv->cop);
> >  			if (unlikely(ret)) {
> > +				core_stats_update_drop(1);
> >  				rte_pktmbuf_free(pkts[i]);
> >  				continue;
> >  			}
> > @@ -643,6 +651,7 @@ ipsec_inline_dequeue(ipsec_xform_fn xform_func,
> struct ipsec_ctx *ipsec_ctx,
> >  		sa = priv->sa;
> >  		ret = xform_func(pkt, sa, &priv->cop);
> >  		if (unlikely(ret)) {
> > +			core_stats_update_drop(1);
> >  			rte_pktmbuf_free(pkt);
> >  			continue;
> >  		}
> > @@ -690,12 +699,14 @@ ipsec_dequeue(ipsec_xform_fn xform_func, struct
> ipsec_ctx *ipsec_ctx,
> >  				RTE_SECURITY_ACTION_TYPE_NONE) {
> >  				ret = xform_func(pkt, sa, cops[j]);
> >  				if (unlikely(ret)) {
> > +					core_stats_update_drop(1);
> >  					rte_pktmbuf_free(pkt);
> >  					continue;
> >  				}
> >  			} else if (ipsec_get_action_type(sa) ==
> >
> 	RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL) {
> >  				if (cops[j]->status) {
> > +					core_stats_update_drop(1);
> >  					rte_pktmbuf_free(pkt);
> >  					continue;
> >  				}
> > diff --git a/examples/ipsec-secgw/ipsec.h
> > b/examples/ipsec-secgw/ipsec.h index 1e642d1..8519eab 100644
> > --- a/examples/ipsec-secgw/ipsec.h
> > +++ b/examples/ipsec-secgw/ipsec.h
> > @@ -46,6 +46,17 @@
> >
> >  #define IP6_VERSION (6)
> >
> > +#ifdef ENABLE_STATS
> > +struct ipsec_core_statistics {
> > +	uint64_t tx;
> > +	uint64_t rx;
> > +	uint64_t dropped;
> > +	uint64_t burst_rx;
> 
> A bit strange to have burst_rx and no similar counterpart for tx.
> BTW, do you need burst_rx?
> Might be better:
> nb_calls_rx, nb_calls_tx;
> and then: rx/nb_calls_rx will give you average burst size.

[Anoob] After the packets are received in the app, further IPsec processing is done, which is more efficient when done in batches. For example, SA lookup is better when done in a batch, as SIMD instructions would be used. Ability of traffic to be in burst or ability of ethdev to give burst will have an effect on the IPsec performance per core. Tx burst percentage isn't that important, since it is the app which submits for Tx.

Rx/nb_calls_rx would also give a similar kind of stat, but the difference is whether 1 packet cases and 31 packet cases are treated equal. I would prefer to keep it the current way, but what you suggested also would work.
 
> 
> > +} __rte_cache_aligned;
> > +
> > +struct ipsec_core_statistics core_statistics[RTE_MAX_ETHPORTS];
> 
> Should be RTE_MAX_LCORES, I think.

[Anoob] Ack

> 
> > +#endif /* ENABLE_STATS */
> > +
> >  struct rte_crypto_xform;
> >  struct ipsec_xform;
> >  struct rte_mbuf;
> > @@ -416,4 +427,15 @@ check_flow_params(uint16_t fdir_portid, uint8_t
> > fdir_qid);  int  create_ipsec_esp_flow(struct ipsec_sa *sa);
> >
> > +static inline void
> > +core_stats_update_drop(int n)
> > +{
> > +#ifdef ENABLE_STATS
> > +	int lcore_id = rte_lcore_id();
> > +	core_statistics[lcore_id].dropped += n; #else
> > +	RTE_SET_USED(n);
> > +#endif /* ENABLE_STATS */
> > +}
> > +
> >  #endif /* __IPSEC_H__ */
> > diff --git a/examples/ipsec-secgw/ipsec_process.c
> > b/examples/ipsec-secgw/ipsec_process.c
> > index bb2f2b8..05cb3ad 100644
> > --- a/examples/ipsec-secgw/ipsec_process.c
> > +++ b/examples/ipsec-secgw/ipsec_process.c
> > @@ -24,6 +24,11 @@ free_pkts(struct rte_mbuf *mb[], uint32_t n)  {
> >  	uint32_t i;
> >
> > +#ifdef ENABLE_STATS
> > +	int lcore_id = rte_lcore_id();
> > +	core_statistics[lcore_id].dropped += n; #endif /* ENABLE_STATS */
> > +
> 
> Same as above - why not use stats_update_drop() here?
> 
> >  	for (i = 0; i != n; i++)
> >  		rte_pktmbuf_free(mb[i]);
> >  }
> > --
> > 2.7.4
  
Ananyev, Konstantin May 5, 2020, 10:45 a.m. UTC | #3
> > > @@ -1159,6 +1243,19 @@ ipsec_poll_mode_worker(void)
> > >  			drain_tx_buffers(qconf);
> > >  			drain_crypto_buffers(qconf);
> > >  			prev_tsc = cur_tsc;
> > > +#ifdef ENABLE_STATS
> > > +			if (lcore_id == rte_get_master_lcore()) {
> > > +				/* advance the timer */
> > > +				timer_tsc += diff_tsc;
> > > +
> > > +				/* if timer has reached its timeout */
> > > +				if (unlikely(timer_tsc >= timer_period)) {
> > > +					print_stats();
> > > +					/* reset the timer */
> > > +					timer_tsc = 0;
> > > +				}
> > > +			}
> > > +#endif /* ENABLE_STATS */
> >
> >
> > Why to do stats collection/display inside data-path?
> > Why not use rte_timer/rte_alarm and make it happen in control thread?
> > Another option - make stats printing at some signal to the process.
> > In that case we don't need to bother with time period at all - it will be user to
> > decide.
> > Again if we remove that print_stats() from data-path it might become cheap
> > enough to always collect it, and we will not need ENABLE_STATS macro at all.
> 
> [Anoob] The stats collection will not be cheap because we are updating a common structure from multiple cores.

What I am saying - per-lcore stats collection (updating lcore stats by lcore itself) - should be cheap enough
and can stay in data-path (probably even always, not only when particular compile flag is enabled). 
Global stats collection and display is quite expensive operation,
so we shouldn't keep it in data-path.
We have a control thread within dpdk (for alarms, signals, timers, etc.), 
why not to call print_stats from there?

> And the idea is not to
> have a very efficient stats mechanism. When we have more cores engaged, IPsec performance is directly dependent on ability of RSS to
> split traffic to multiple cores. Such stats would come handy in understanding per core distribution of incoming traffic.
> 
> >
  

Patch

diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 6d02341..db92ddc 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -288,6 +288,61 @@  adjust_ipv6_pktlen(struct rte_mbuf *m, const struct rte_ipv6_hdr *iph,
 	}
 }
 
+#ifdef ENABLE_STATS
+static uint64_t timer_period = 10; /* default period is 10 seconds */
+
+/* Print out statistics on packet distribution */
+static void
+print_stats(void)
+{
+	uint64_t total_packets_dropped, total_packets_tx, total_packets_rx;
+	unsigned int coreid;
+	float burst_percent;
+
+	total_packets_dropped = 0;
+	total_packets_tx = 0;
+	total_packets_rx = 0;
+
+	const char clr[] = { 27, '[', '2', 'J', '\0' };
+	const char topLeft[] = { 27, '[', '1', ';', '1', 'H', '\0' };
+
+	/* Clear screen and move to top left */
+	printf("%s%s", clr, topLeft);
+
+	printf("\nCore statistics ====================================");
+
+	for (coreid = 0; coreid < RTE_MAX_LCORE; coreid++) {
+		/* skip disabled cores */
+		if (rte_lcore_is_enabled(coreid) == 0)
+			continue;
+		burst_percent = (float)(core_statistics[coreid].burst_rx * 100)/
+					core_statistics[coreid].rx;
+		printf("\nStatistics for core %u ------------------------------"
+			   "\nPackets received: %20"PRIu64
+			   "\nPackets sent: %24"PRIu64
+			   "\nPackets dropped: %21"PRIu64
+			   "\nBurst percent: %23.2f",
+			   coreid,
+			   core_statistics[coreid].rx,
+			   core_statistics[coreid].tx,
+			   core_statistics[coreid].dropped,
+			   burst_percent);
+
+		total_packets_dropped += core_statistics[coreid].dropped;
+		total_packets_tx += core_statistics[coreid].tx;
+		total_packets_rx += core_statistics[coreid].rx;
+	}
+	printf("\nAggregate statistics ==============================="
+		   "\nTotal packets received: %14"PRIu64
+		   "\nTotal packets sent: %18"PRIu64
+		   "\nTotal packets dropped: %15"PRIu64,
+		   total_packets_rx,
+		   total_packets_tx,
+		   total_packets_dropped);
+	printf("\n====================================================\n");
+}
+#endif /* ENABLE_STATS */
+
 static inline void
 prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)
 {
@@ -333,6 +388,7 @@  prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)
 
 		/* drop packet when IPv6 header exceeds first segment length */
 		if (unlikely(l3len > pkt->data_len)) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(pkt);
 			return;
 		}
@@ -350,6 +406,7 @@  prepare_one_packet(struct rte_mbuf *pkt, struct ipsec_traffic *t)
 		/* Unknown/Unsupported type, drop the packet */
 		RTE_LOG(ERR, IPSEC, "Unsupported packet type 0x%x\n",
 			rte_be_to_cpu_16(eth->ether_type));
+		core_stats_update_drop(1);
 		rte_pktmbuf_free(pkt);
 		return;
 	}
@@ -471,6 +528,11 @@  send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
 	int32_t ret;
 	uint16_t queueid;
 
+#ifdef ENABLE_STATS
+	int lcore_id = rte_lcore_id();
+	core_statistics[lcore_id].tx += n;
+#endif /* ENABLE_STATS */
+
 	queueid = qconf->tx_queue_id[port];
 	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
 
@@ -478,6 +540,9 @@  send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
 
 	ret = rte_eth_tx_burst(port, queueid, m_table, n);
 	if (unlikely(ret < n)) {
+#ifdef ENABLE_STATS
+		core_statistics[lcore_id].dropped += n-ret;
+#endif /* ENABLE_STATS */
 		do {
 			rte_pktmbuf_free(m_table[ret]);
 		} while (++ret < n);
@@ -525,6 +590,7 @@  send_fragment_packet(struct lcore_conf *qconf, struct rte_mbuf *m,
 			"error code: %d\n",
 			__func__, m->pkt_len, rte_errno);
 
+	core_stats_update_drop(1);
 	rte_pktmbuf_free(m);
 	return len;
 }
@@ -549,8 +615,10 @@  send_single_packet(struct rte_mbuf *m, uint16_t port, uint8_t proto)
 	/* need to fragment the packet */
 	} else if (frag_tbl_sz > 0)
 		len = send_fragment_packet(qconf, m, port, proto);
-	else
+	else {
+		core_stats_update_drop(1);
 		rte_pktmbuf_free(m);
+	}
 
 	/* enough pkts to be sent */
 	if (unlikely(len == MAX_PKT_BURST)) {
@@ -584,18 +652,21 @@  inbound_sp_sa(struct sp_ctx *sp, struct sa_ctx *sa, struct traffic_type *ip,
 			continue;
 		}
 		if (res == DISCARD) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(m);
 			continue;
 		}
 
 		/* Only check SPI match for processed IPSec packets */
 		if (i < lim && ((m->ol_flags & PKT_RX_SEC_OFFLOAD) == 0)) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(m);
 			continue;
 		}
 
 		sa_idx = res - 1;
 		if (!inbound_sa_check(sa, m, sa_idx)) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(m);
 			continue;
 		}
@@ -630,8 +701,10 @@  split46_traffic(struct ipsec_traffic *trf, struct rte_mbuf *mb[], uint32_t num)
 					uint8_t *,
 					offsetof(struct ip6_hdr, ip6_nxt));
 			n6++;
-		} else
+		} else {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(m);
+		}
 	}
 
 	trf->ip4.num = n4;
@@ -682,11 +755,12 @@  outbound_sp(struct sp_ctx *sp, struct traffic_type *ip,
 	for (i = 0; i < ip->num; i++) {
 		m = ip->pkts[i];
 		sa_idx = ip->res[i] - 1;
-		if (ip->res[i] == DISCARD)
+		if (ip->res[i] == DISCARD) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(m);
-		else if (ip->res[i] == BYPASS)
+		} else if (ip->res[i] == BYPASS) {
 			ip->pkts[j++] = m;
-		else {
+		} else {
 			ipsec->res[ipsec->num] = sa_idx;
 			ipsec->pkts[ipsec->num++] = m;
 		}
@@ -705,6 +779,8 @@  process_pkts_outbound(struct ipsec_ctx *ipsec_ctx,
 	for (i = 0; i < traffic->ipsec.num; i++)
 		rte_pktmbuf_free(traffic->ipsec.pkts[i]);
 
+	core_stats_update_drop(traffic->ipsec.num);
+
 	traffic->ipsec.num = 0;
 
 	outbound_sp(ipsec_ctx->sp4_ctx, &traffic->ip4, &traffic->ipsec);
@@ -745,12 +821,14 @@  process_pkts_inbound_nosp(struct ipsec_ctx *ipsec_ctx,
 	/* Drop any IPv4 traffic from unprotected ports */
 	for (i = 0; i < traffic->ip4.num; i++)
 		rte_pktmbuf_free(traffic->ip4.pkts[i]);
+	core_stats_update_drop(traffic->ip4.num);
 
 	traffic->ip4.num = 0;
 
 	/* Drop any IPv6 traffic from unprotected ports */
 	for (i = 0; i < traffic->ip6.num; i++)
 		rte_pktmbuf_free(traffic->ip6.pkts[i]);
+	core_stats_update_drop(traffic->ip6.num);
 
 	traffic->ip6.num = 0;
 
@@ -788,6 +866,7 @@  process_pkts_outbound_nosp(struct ipsec_ctx *ipsec_ctx,
 	/* Drop any IPsec traffic from protected ports */
 	for (i = 0; i < traffic->ipsec.num; i++)
 		rte_pktmbuf_free(traffic->ipsec.pkts[i]);
+	core_stats_update_drop(traffic->ipsec.num);
 
 	n = 0;
 
@@ -901,6 +980,7 @@  route4_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf *pkts[], uint8_t nb_pkts)
 		}
 
 		if ((pkt_hop & RTE_LPM_LOOKUP_SUCCESS) == 0) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(pkts[i]);
 			continue;
 		}
@@ -953,6 +1033,7 @@  route6_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf *pkts[], uint8_t nb_pkts)
 		}
 
 		if (pkt_hop == -1) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(pkts[i]);
 			continue;
 		}
@@ -1099,6 +1180,9 @@  ipsec_poll_mode_worker(void)
 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
 			/ US_PER_S * BURST_TX_DRAIN_US;
 	struct lcore_rx_queue *rxql;
+#ifdef ENABLE_STATS
+	uint64_t timer_tsc = 0;
+#endif /* ENABLE_STATS */
 
 	prev_tsc = 0;
 	lcore_id = rte_lcore_id();
@@ -1159,6 +1243,19 @@  ipsec_poll_mode_worker(void)
 			drain_tx_buffers(qconf);
 			drain_crypto_buffers(qconf);
 			prev_tsc = cur_tsc;
+#ifdef ENABLE_STATS
+			if (lcore_id == rte_get_master_lcore()) {
+				/* advance the timer */
+				timer_tsc += diff_tsc;
+
+				/* if timer has reached its timeout */
+				if (unlikely(timer_tsc >= timer_period)) {
+					print_stats();
+					/* reset the timer */
+					timer_tsc = 0;
+				}
+			}
+#endif /* ENABLE_STATS */
 		}
 
 		for (i = 0; i < qconf->nb_rx_queue; ++i) {
@@ -1169,6 +1266,12 @@  ipsec_poll_mode_worker(void)
 			nb_rx = rte_eth_rx_burst(portid, queueid,
 					pkts, MAX_PKT_BURST);
 
+#ifdef ENABLE_STATS
+			core_statistics[lcore_id].rx += nb_rx;
+			if (nb_rx == MAX_PKT_BURST)
+				core_statistics[lcore_id].burst_rx += nb_rx;
+#endif /* ENABLE_STATS */
+
 			if (nb_rx > 0)
 				process_pkts(qconf, pkts, nb_rx, portid);
 
@@ -2747,6 +2850,11 @@  main(int32_t argc, char **argv)
 	signal(SIGINT, signal_handler);
 	signal(SIGTERM, signal_handler);
 
+#ifdef ENABLE_STATS
+	/* convert to number of cycles */
+	timer_period *= rte_get_timer_hz();
+#endif /* ENABLE_STATS */
+
 	/* initialize event helper configuration */
 	eh_conf = eh_conf_init();
 	if (eh_conf == NULL)
diff --git a/examples/ipsec-secgw/ipsec-secgw.h b/examples/ipsec-secgw/ipsec-secgw.h
index 4b53cb5..d886a35 100644
--- a/examples/ipsec-secgw/ipsec-secgw.h
+++ b/examples/ipsec-secgw/ipsec-secgw.h
@@ -6,6 +6,8 @@ 
 
 #include <stdbool.h>
 
+//#define ENABLE_STATS
+
 #define NB_SOCKETS 4
 
 #define MAX_PKT_BURST 32
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index bf88d80..dcb9312 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -499,8 +499,10 @@  enqueue_cop_burst(struct cdev_qp *cqp)
 			" enqueued %u crypto ops out of %u\n",
 			cqp->id, cqp->qp, ret, len);
 			/* drop packets that we fail to enqueue */
-			for (i = ret; i < len; i++)
+			for (i = ret; i < len; i++) {
+				core_stats_update_drop(1);
 				rte_pktmbuf_free(cqp->buf[i]->sym->m_src);
+			}
 	}
 	cqp->in_flight += ret;
 	cqp->len = 0;
@@ -528,6 +530,7 @@  ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 
 	for (i = 0; i < nb_pkts; i++) {
 		if (unlikely(sas[i] == NULL)) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(pkts[i]);
 			continue;
 		}
@@ -549,6 +552,7 @@  ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 
 			if ((unlikely(ips->security.ses == NULL)) &&
 				create_lookaside_session(ipsec_ctx, sa, ips)) {
+				core_stats_update_drop(1);
 				rte_pktmbuf_free(pkts[i]);
 				continue;
 			}
@@ -563,6 +567,7 @@  ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 		case RTE_SECURITY_ACTION_TYPE_CPU_CRYPTO:
 			RTE_LOG(ERR, IPSEC, "CPU crypto is not supported by the"
 					" legacy mode.");
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(pkts[i]);
 			continue;
 
@@ -575,6 +580,7 @@  ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 
 			if ((unlikely(ips->crypto.ses == NULL)) &&
 				create_lookaside_session(ipsec_ctx, sa, ips)) {
+				core_stats_update_drop(1);
 				rte_pktmbuf_free(pkts[i]);
 				continue;
 			}
@@ -584,6 +590,7 @@  ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 
 			ret = xform_func(pkts[i], sa, &priv->cop);
 			if (unlikely(ret)) {
+				core_stats_update_drop(1);
 				rte_pktmbuf_free(pkts[i]);
 				continue;
 			}
@@ -608,6 +615,7 @@  ipsec_enqueue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 
 			ret = xform_func(pkts[i], sa, &priv->cop);
 			if (unlikely(ret)) {
+				core_stats_update_drop(1);
 				rte_pktmbuf_free(pkts[i]);
 				continue;
 			}
@@ -643,6 +651,7 @@  ipsec_inline_dequeue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 		sa = priv->sa;
 		ret = xform_func(pkt, sa, &priv->cop);
 		if (unlikely(ret)) {
+			core_stats_update_drop(1);
 			rte_pktmbuf_free(pkt);
 			continue;
 		}
@@ -690,12 +699,14 @@  ipsec_dequeue(ipsec_xform_fn xform_func, struct ipsec_ctx *ipsec_ctx,
 				RTE_SECURITY_ACTION_TYPE_NONE) {
 				ret = xform_func(pkt, sa, cops[j]);
 				if (unlikely(ret)) {
+					core_stats_update_drop(1);
 					rte_pktmbuf_free(pkt);
 					continue;
 				}
 			} else if (ipsec_get_action_type(sa) ==
 				RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL) {
 				if (cops[j]->status) {
+					core_stats_update_drop(1);
 					rte_pktmbuf_free(pkt);
 					continue;
 				}
diff --git a/examples/ipsec-secgw/ipsec.h b/examples/ipsec-secgw/ipsec.h
index 1e642d1..8519eab 100644
--- a/examples/ipsec-secgw/ipsec.h
+++ b/examples/ipsec-secgw/ipsec.h
@@ -46,6 +46,17 @@ 
 
 #define IP6_VERSION (6)
 
+#ifdef ENABLE_STATS
+struct ipsec_core_statistics {
+	uint64_t tx;
+	uint64_t rx;
+	uint64_t dropped;
+	uint64_t burst_rx;
+} __rte_cache_aligned;
+
+struct ipsec_core_statistics core_statistics[RTE_MAX_ETHPORTS];
+#endif /* ENABLE_STATS */
+
 struct rte_crypto_xform;
 struct ipsec_xform;
 struct rte_mbuf;
@@ -416,4 +427,15 @@  check_flow_params(uint16_t fdir_portid, uint8_t fdir_qid);
 int
 create_ipsec_esp_flow(struct ipsec_sa *sa);
 
+static inline void
+core_stats_update_drop(int n)
+{
+#ifdef ENABLE_STATS
+	int lcore_id = rte_lcore_id();
+	core_statistics[lcore_id].dropped += n;
+#else
+	RTE_SET_USED(n);
+#endif /* ENABLE_STATS */
+}
+
 #endif /* __IPSEC_H__ */
diff --git a/examples/ipsec-secgw/ipsec_process.c b/examples/ipsec-secgw/ipsec_process.c
index bb2f2b8..05cb3ad 100644
--- a/examples/ipsec-secgw/ipsec_process.c
+++ b/examples/ipsec-secgw/ipsec_process.c
@@ -24,6 +24,11 @@  free_pkts(struct rte_mbuf *mb[], uint32_t n)
 {
 	uint32_t i;
 
+#ifdef ENABLE_STATS
+	int lcore_id = rte_lcore_id();
+	core_statistics[lcore_id].dropped += n;
+#endif /* ENABLE_STATS */
+
 	for (i = 0; i != n; i++)
 		rte_pktmbuf_free(mb[i]);
 }