[dpdk-dev] baseband/turbo_sw: offload cost measurement test

Message ID 20180404140602.9344-2-kamilx.chalupnik@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Kamil Chalupnik April 4, 2018, 2:05 p.m. UTC
  From: "Chalupnik, KamilX" <kamilx.chalupnik@intel.com>

New test created to measure offload cost.
Changes were introduced in API, turbo software driver
and test application

Signed-off-by: KamilX Chalupnik <kamilx.chalupnik@intel.com>
---
 app/test-bbdev/test_bbdev_perf.c                 | 333 ++++++++++++++++++-----
 drivers/baseband/turbo_sw/bbdev_turbo_software.c | 102 ++++---
 lib/librte_bbdev/rte_bbdev.h                     |   4 +
 3 files changed, 333 insertions(+), 106 deletions(-)
  

Comments

Mokhtar, Amr April 13, 2018, 11:18 p.m. UTC | #1
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Wednesday 4 April 2018 15:06
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; Chalupnik, KamilX
> <kamilx.chalupnik@intel.com>
> Subject: [PATCH] baseband/turbo_sw: offload cost measurement test
> 
> From: "Chalupnik, KamilX" <kamilx.chalupnik@intel.com>
> 
> New test created to measure offload cost.
> Changes were introduced in API, turbo software driver
> and test application
> 
> Signed-off-by: KamilX Chalupnik <kamilx.chalupnik@intel.com>

Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
  
Mokhtar, Amr April 13, 2018, 11:37 p.m. UTC | #2
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Wednesday 4 April 2018 15:06
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; Chalupnik, KamilX
> <kamilx.chalupnik@intel.com>
> Subject: [PATCH] baseband/turbo_sw: offload cost measurement test
> 
> From: "Chalupnik, KamilX" <kamilx.chalupnik@intel.com>
> 
> New test created to measure offload cost.
> Changes were introduced in API, turbo software driver
> and test application
> 
> Signed-off-by: KamilX Chalupnik <kamilx.chalupnik@intel.com>



> b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
> index 302abf5..70691f3 100644
> --- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
> +++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
> @@ -9,6 +9,7 @@
>  #include <rte_malloc.h>
>  #include <rte_ring.h>
>  #include <rte_kvargs.h>
> +#include <rte_cycles.h>
> 
>  #include <rte_bbdev.h>
>  #include <rte_bbdev_pmd.h>
> @@ -20,18 +21,6 @@
> 
>  #define DRIVER_NAME turbo_sw
> 
> -/* Turbo SW PMD logging ID */
> -static int bbdev_turbo_sw_logtype;
> -
> -/* Helper macro for logging */
> -#define rte_bbdev_log(level, fmt, ...) \
> -	rte_log(RTE_LOG_ ## level, bbdev_turbo_sw_logtype, fmt "\n", \
> -		##__VA_ARGS__)
> -
> -#define rte_bbdev_log_debug(fmt, ...) \
> -	rte_bbdev_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
> -		##__VA_ARGS__)
> -

Why are the logging macros being deleted?
This prevents the turbo_sw driver from building.
Please ignore my previous Ack; it was applied to this patch by mistake.

Thanks,
Amr
  

Patch

diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index 00f3b08..be2e20c 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -83,6 +83,28 @@  struct thread_params {
 	struct test_op_params *op_params;
 };
 
+/* Stores time statistics */
+struct test_time_stats {
+	/* Stores software enqueue total working time */
+	uint64_t enq_sw_tot_time;
+	/* Stores minimum value of software enqueue working time */
+	uint64_t enq_sw_min_time;
+	/* Stores maximum value of software enqueue working time */
+	uint64_t enq_sw_max_time;
+	/* Stores turbo enqueue total working time */
+	uint64_t enq_tur_tot_time;
+	/* Stores minimum value of turbo enqueue working time */
+	uint64_t enq_tur_min_time;
+	/* Stores maximum value of turbo enqueue working time */
+	uint64_t enq_tur_max_time;
+	/* Stores dequeue total working time */
+	uint64_t deq_tot_time;
+	/* Stores minimum value of dequeue working time */
+	uint64_t deq_min_time;
+	/* Stores maximum value of dequeue working time */
+	uint64_t deq_max_time;
+};
+
 typedef int (test_case_function)(struct active_device *ad,
 		struct test_op_params *op_params);
 
@@ -1104,7 +1126,6 @@  dequeue_event_callback(uint16_t dev_id,
 	double in_len;
 
 	struct thread_params *tp = cb_arg;
-
 	RTE_SET_USED(ret_param);
 	queue_id = tp->queue_id;
 
@@ -1649,20 +1670,21 @@  throughput_test(struct active_device *ad,
 }
 
 static int
-operation_latency_test_dec(struct rte_mempool *mempool,
+latency_test_dec(struct rte_mempool *mempool,
 		struct test_buffers *bufs, struct rte_bbdev_dec_op *ref_op,
 		int vector_mask, uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *total_time)
+		uint64_t *total_time, uint64_t *min_time, uint64_t *max_time)
 {
 	int ret = TEST_SUCCESS;
 	uint16_t i, j, dequeued;
 	struct rte_bbdev_dec_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
-	uint64_t start_time = 0;
+	uint64_t start_time = 0, last_time = 0;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
 		bool first_time = true;
+		last_time = 0;
 
 		if (unlikely(num_to_process - dequeued < burst_sz))
 			burst_sz = num_to_process - dequeued;
@@ -1692,11 +1714,15 @@  operation_latency_test_dec(struct rte_mempool *mempool,
 			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
 			if (likely(first_time && (deq > 0))) {
-				*total_time += rte_rdtsc_precise() - start_time;
+				last_time = rte_rdtsc_precise() - start_time;
 				first_time = false;
 			}
 		} while (unlikely(burst_sz != deq));
 
+		*max_time = RTE_MAX(*max_time, last_time);
+		*min_time = RTE_MIN(*min_time, last_time);
+		*total_time += last_time;
+
 		if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
 			ret = validate_dec_op(ops_deq, burst_sz, ref_op,
 					vector_mask);
@@ -1711,20 +1737,21 @@  operation_latency_test_dec(struct rte_mempool *mempool,
 }
 
 static int
-operation_latency_test_enc(struct rte_mempool *mempool,
+latency_test_enc(struct rte_mempool *mempool,
 		struct test_buffers *bufs, struct rte_bbdev_enc_op *ref_op,
 		uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *total_time)
+		uint64_t *total_time, uint64_t *min_time, uint64_t *max_time)
 {
 	int ret = TEST_SUCCESS;
 	uint16_t i, j, dequeued;
 	struct rte_bbdev_enc_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
-	uint64_t start_time = 0;
+	uint64_t start_time = 0, last_time = 0;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
 		bool first_time = true;
+		last_time = 0;
 
 		if (unlikely(num_to_process - dequeued < burst_sz))
 			burst_sz = num_to_process - dequeued;
@@ -1753,11 +1780,15 @@  operation_latency_test_enc(struct rte_mempool *mempool,
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
 			if (likely(first_time && (deq > 0))) {
-				*total_time += rte_rdtsc_precise() - start_time;
+				last_time += rte_rdtsc_precise() - start_time;
 				first_time = false;
 			}
 		} while (unlikely(burst_sz != deq));
 
+		*max_time = RTE_MAX(*max_time, last_time);
+		*min_time = RTE_MIN(*min_time, last_time);
+		*total_time += last_time;
+
 		if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
 			ret = validate_enc_op(ops_deq, burst_sz, ref_op);
 			TEST_ASSERT_SUCCESS(ret, "Validation failed!");
@@ -1771,7 +1802,7 @@  operation_latency_test_enc(struct rte_mempool *mempool,
 }
 
 static int
-operation_latency_test(struct active_device *ad,
+latency_test(struct active_device *ad,
 		struct test_op_params *op_params)
 {
 	int iter;
@@ -1781,9 +1812,12 @@  operation_latency_test(struct active_device *ad,
 	const uint16_t queue_id = ad->queue_ids[0];
 	struct test_buffers *bufs = NULL;
 	struct rte_bbdev_info info;
-	uint64_t total_time = 0;
+	uint64_t total_time, min_time, max_time;
 	const char *op_type_str;
 
+	total_time = max_time = 0;
+	min_time = UINT64_MAX;
+
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
 			"BURST_SIZE should be <= %u", MAX_BURST);
 
@@ -1798,36 +1832,65 @@  operation_latency_test(struct active_device *ad,
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
-		iter = operation_latency_test_dec(op_params->mp, bufs,
+		iter = latency_test_dec(op_params->mp, bufs,
 				op_params->ref_dec_op, op_params->vector_mask,
 				ad->dev_id, queue_id, num_to_process,
-				burst_sz, &total_time);
+				burst_sz, &total_time, &min_time, &max_time);
 	else
-		iter = operation_latency_test_enc(op_params->mp, bufs,
+		iter = latency_test_enc(op_params->mp, bufs,
 				op_params->ref_enc_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &total_time);
+				num_to_process, burst_sz, &total_time,
+				&min_time, &max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\toperation avg. latency: %lg cycles, %lg us\n",
+	printf("\toperation latency:\n"
+			"\t\tavg latency: %lg cycles, %lg us\n"
+			"\t\tmin latency: %lg cycles, %lg us\n"
+			"\t\tmax latency: %lg cycles, %lg us\n",
 			(double)total_time / (double)iter,
 			(double)(total_time * 1000000) / (double)iter /
+			(double)rte_get_tsc_hz(), (double)min_time,
+			(double)(min_time * 1000000) / (double)rte_get_tsc_hz(),
+			(double)max_time, (double)(max_time * 1000000) /
 			(double)rte_get_tsc_hz());
 
 	return TEST_SUCCESS;
 }
 
 static int
+get_bbdev_queue_stats(uint16_t dev_id, uint16_t queue_id,
+		struct rte_bbdev_stats *stats)
+{
+	struct rte_bbdev *dev = &rte_bbdev_devices[dev_id];
+	struct rte_bbdev_stats *q_stats;
+
+	if (queue_id >= dev->data->num_queues)
+		return -1;
+
+	q_stats = &dev->data->queues[queue_id].queue_stats;
+
+	stats->enqueued_count = q_stats->enqueued_count;
+	stats->dequeued_count = q_stats->dequeued_count;
+	stats->enqueue_err_count = q_stats->enqueue_err_count;
+	stats->dequeue_err_count = q_stats->dequeue_err_count;
+	stats->turbo_perf_time = q_stats->turbo_perf_time;
+
+	return 0;
+}
+
+static int
 offload_latency_test_dec(struct rte_mempool *mempool, struct test_buffers *bufs,
 		struct rte_bbdev_dec_op *ref_op, uint16_t dev_id,
 		uint16_t queue_id, const uint16_t num_to_process,
-		uint16_t burst_sz, uint64_t *enq_total_time,
-		uint64_t *deq_total_time)
+		uint16_t burst_sz, struct test_time_stats *time_st)
 {
-	int i, dequeued;
+	int i, dequeued, ret;
 	struct rte_bbdev_dec_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
 	uint64_t enq_start_time, deq_start_time;
+	uint64_t enq_sw_last_time, deq_last_time;
+	struct rte_bbdev_stats stats;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
@@ -1843,24 +1906,54 @@  offload_latency_test_dec(struct rte_mempool *mempool, struct test_buffers *bufs,
 					bufs->soft_outputs,
 					ref_op);
 
-		/* Start time measurment for enqueue function offload latency */
-		enq_start_time = rte_rdtsc();
+		/* Start time meas for enqueue function offload latency */
+		enq_start_time = rte_rdtsc_precise();
 		do {
 			enq += rte_bbdev_enqueue_dec_ops(dev_id, queue_id,
 					&ops_enq[enq], burst_sz - enq);
 		} while (unlikely(burst_sz != enq));
-		*enq_total_time += rte_rdtsc() - enq_start_time;
+
+		ret = get_bbdev_queue_stats(dev_id, queue_id, &stats);
+		TEST_ASSERT_SUCCESS(ret,
+				"Failed to get stats for queue (%u) of device (%u)",
+				queue_id, dev_id);
+
+		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
+				stats.turbo_perf_time;
+		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
+				enq_sw_last_time);
+		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
+				enq_sw_last_time);
+		time_st->enq_sw_tot_time += enq_sw_last_time;
+
+		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_tot_time += stats.turbo_perf_time;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
 
-		/* Start time measurment for dequeue function offload latency */
-		deq_start_time = rte_rdtsc();
+		/* Start time meas for dequeue function offload latency */
+		deq_start_time = rte_rdtsc_precise();
+		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
+					&ops_deq[deq], 1);
+		} while (unlikely(deq != 1));
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
+				deq_last_time);
+		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
+				deq_last_time);
+		time_st->deq_tot_time += deq_last_time;
+
+		/* Dequeue remaining operations if needed*/
+		while (burst_sz != deq)
+			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
-		} while (unlikely(burst_sz != deq));
-		*deq_total_time += rte_rdtsc() - deq_start_time;
 
 		rte_bbdev_dec_op_free_bulk(ops_enq, deq);
 		dequeued += deq;
@@ -1873,12 +1966,13 @@  static int
 offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 		struct rte_bbdev_enc_op *ref_op, uint16_t dev_id,
 		uint16_t queue_id, const uint16_t num_to_process,
-		uint16_t burst_sz, uint64_t *enq_total_time,
-		uint64_t *deq_total_time)
+		uint16_t burst_sz, struct test_time_stats *time_st)
 {
-	int i, dequeued;
+	int i, dequeued, ret;
 	struct rte_bbdev_enc_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
 	uint64_t enq_start_time, deq_start_time;
+	uint64_t enq_sw_last_time, deq_last_time;
+	struct rte_bbdev_stats stats;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
@@ -1893,24 +1987,53 @@  offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 					bufs->hard_outputs,
 					ref_op);
 
-		/* Start time measurment for enqueue function offload latency */
-		enq_start_time = rte_rdtsc();
+		/* Start time meas for enqueue function offload latency */
+		enq_start_time = rte_rdtsc_precise();
 		do {
 			enq += rte_bbdev_enqueue_enc_ops(dev_id, queue_id,
 					&ops_enq[enq], burst_sz - enq);
 		} while (unlikely(burst_sz != enq));
-		*enq_total_time += rte_rdtsc() - enq_start_time;
+
+		ret = get_bbdev_queue_stats(dev_id, queue_id, &stats);
+		TEST_ASSERT_SUCCESS(ret,
+				"Failed to get stats for queue (%u) of device (%u)",
+				queue_id, dev_id);
+
+		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
+				stats.turbo_perf_time;
+		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
+				enq_sw_last_time);
+		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
+				enq_sw_last_time);
+		time_st->enq_sw_tot_time += enq_sw_last_time;
+
+		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_tot_time += stats.turbo_perf_time;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
 
-		/* Start time measurment for dequeue function offload latency */
-		deq_start_time = rte_rdtsc();
+		/* Start time meas for dequeue function offload latency */
+		deq_start_time = rte_rdtsc_precise();
+		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
+					&ops_deq[deq], 1);
+		} while (unlikely(deq != 1));
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
+				deq_last_time);
+		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
+				deq_last_time);
+		time_st->deq_tot_time += deq_last_time;
+
+		while (burst_sz != deq)
+			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
-		} while (unlikely(burst_sz != deq));
-		*deq_total_time += rte_rdtsc() - deq_start_time;
 
 		rte_bbdev_enc_op_free_bulk(ops_enq, deq);
 		dequeued += deq;
@@ -1920,11 +2043,10 @@  offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 }
 
 static int
-offload_latency_test(struct active_device *ad,
+offload_cost_test(struct active_device *ad,
 		struct test_op_params *op_params)
 {
 	int iter;
-	uint64_t enq_total_time = 0, deq_total_time = 0;
 	uint16_t burst_sz = op_params->burst_sz;
 	const uint16_t num_to_process = op_params->num_to_process;
 	const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -1932,6 +2054,12 @@  offload_latency_test(struct active_device *ad,
 	struct test_buffers *bufs = NULL;
 	struct rte_bbdev_info info;
 	const char *op_type_str;
+	struct test_time_stats time_st;
+
+	memset(&time_st, 0, sizeof(struct test_time_stats));
+	time_st.enq_sw_min_time = UINT64_MAX;
+	time_st.enq_tur_min_time = UINT64_MAX;
+	time_st.deq_min_time = UINT64_MAX;
 
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
 			"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1949,26 +2077,51 @@  offload_latency_test(struct active_device *ad,
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
 		iter = offload_latency_test_dec(op_params->mp, bufs,
 				op_params->ref_dec_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &enq_total_time,
-				&deq_total_time);
+				num_to_process, burst_sz, &time_st);
 	else
 		iter = offload_latency_test_enc(op_params->mp, bufs,
 				op_params->ref_enc_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &enq_total_time,
-				&deq_total_time);
+				num_to_process, burst_sz, &time_st);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tenq offload avg. latency: %lg cycles, %lg us\n",
-			(double)enq_total_time / (double)iter,
-			(double)(enq_total_time * 1000000) / (double)iter /
-			(double)rte_get_tsc_hz());
-
-	printf("\tdeq offload avg. latency: %lg cycles, %lg us\n",
-			(double)deq_total_time / (double)iter,
-			(double)(deq_total_time * 1000000) / (double)iter /
-			(double)rte_get_tsc_hz());
+	printf("\tenq offload cost latency:\n"
+			"\t\tsoftware avg %lg cycles, %lg us\n"
+			"\t\tsoftware min %lg cycles, %lg us\n"
+			"\t\tsoftware max %lg cycles, %lg us\n"
+			"\t\tturbo avg %lg cycles, %lg us\n"
+			"\t\tturbo min %lg cycles, %lg us\n"
+			"\t\tturbo max %lg cycles, %lg us\n",
+			(double)time_st.enq_sw_tot_time / (double)iter,
+			(double)(time_st.enq_sw_tot_time * 1000000) /
+			(double)iter / (double)rte_get_tsc_hz(),
+			(double)time_st.enq_sw_min_time,
+			(double)(time_st.enq_sw_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_sw_max_time,
+			(double)(time_st.enq_sw_max_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_tur_tot_time /
+			(double)iter,
+			(double)(time_st.enq_tur_tot_time * 1000000) /
+			(double)iter / (double)rte_get_tsc_hz(),
+			(double)time_st.enq_tur_min_time,
+			(double)(time_st.enq_tur_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_tur_max_time,
+			(double)(time_st.enq_tur_max_time * 1000000) /
+			rte_get_tsc_hz());
+
+	printf("\tdeq offload cost latency - one op:\n"
+			"\t\tavg %lg cycles, %lg us\n"
+			"\t\tmin %lg cycles, %lg us\n"
+			"\t\tmax %lg cycles, %lg us\n",
+			(double)time_st.deq_tot_time / (double)iter,
+			(double)(time_st.deq_tot_time * 1000000) /
+			(double)iter / (double)rte_get_tsc_hz(),
+			(double)time_st.deq_min_time,
+			(double)(time_st.deq_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.deq_max_time,
+			(double)(time_st.deq_max_time * 1000000) /
+			rte_get_tsc_hz());
 
 	return TEST_SUCCESS;
 }
@@ -1976,21 +2129,28 @@  offload_latency_test(struct active_device *ad,
 static int
 offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_total_time)
+		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_max_time)
 {
 	int i, deq_total;
 	struct rte_bbdev_dec_op *ops[MAX_BURST];
-	uint64_t deq_start_time;
+	uint64_t deq_start_time, deq_last_time;
 
 	/* Test deq offload latency from an empty queue */
-	deq_start_time = rte_rdtsc_precise();
+
 	for (i = 0, deq_total = 0; deq_total < num_to_process;
 			++i, deq_total += burst_sz) {
+		deq_start_time = rte_rdtsc_precise();
+
 		if (unlikely(num_to_process - deq_total < burst_sz))
 			burst_sz = num_to_process - deq_total;
 		rte_bbdev_dequeue_dec_ops(dev_id, queue_id, ops, burst_sz);
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
+		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
+		*deq_tot_time += deq_last_time;
 	}
-	*deq_total_time = rte_rdtsc_precise() - deq_start_time;
 
 	return i;
 }
@@ -1998,21 +2158,27 @@  offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 static int
 offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_total_time)
+		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_max_time)
 {
 	int i, deq_total;
 	struct rte_bbdev_enc_op *ops[MAX_BURST];
-	uint64_t deq_start_time;
+	uint64_t deq_start_time, deq_last_time;
 
 	/* Test deq offload latency from an empty queue */
-	deq_start_time = rte_rdtsc_precise();
 	for (i = 0, deq_total = 0; deq_total < num_to_process;
 			++i, deq_total += burst_sz) {
+		deq_start_time = rte_rdtsc_precise();
+
 		if (unlikely(num_to_process - deq_total < burst_sz))
 			burst_sz = num_to_process - deq_total;
 		rte_bbdev_dequeue_enc_ops(dev_id, queue_id, ops, burst_sz);
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
+		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
+		*deq_tot_time += deq_last_time;
 	}
-	*deq_total_time = rte_rdtsc_precise() - deq_start_time;
 
 	return i;
 }
@@ -2022,7 +2188,7 @@  offload_latency_empty_q_test(struct active_device *ad,
 		struct test_op_params *op_params)
 {
 	int iter;
-	uint64_t deq_total_time = 0;
+	uint64_t deq_tot_time, deq_min_time, deq_max_time;
 	uint16_t burst_sz = op_params->burst_sz;
 	const uint16_t num_to_process = op_params->num_to_process;
 	const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -2030,6 +2196,9 @@  offload_latency_empty_q_test(struct active_device *ad,
 	struct rte_bbdev_info info;
 	const char *op_type_str;
 
+	deq_tot_time = deq_max_time = 0;
+	deq_min_time = UINT64_MAX;
+
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
 			"BURST_SIZE should be <= %u", MAX_BURST);
 
@@ -2044,18 +2213,26 @@  offload_latency_empty_q_test(struct active_device *ad,
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
 		iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_total_time);
+				num_to_process, burst_sz, &deq_tot_time,
+				&deq_min_time, &deq_max_time);
 	else
 		iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_total_time);
+				num_to_process, burst_sz, &deq_tot_time,
+				&deq_min_time, &deq_max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tempty deq offload avg. latency: %lg cycles, %lg us\n",
-			(double)deq_total_time / (double)iter,
-			(double)(deq_total_time * 1000000) / (double)iter /
-			(double)rte_get_tsc_hz());
+	printf("\tempty deq offload\n"
+			"\t\tavg. latency: %lg cycles, %lg us\n"
+			"\t\tmin. latency: %lg cycles, %lg us\n"
+			"\t\tmax. latency: %lg cycles, %lg us\n",
+			(double)deq_tot_time / (double)iter,
+			(double)(deq_tot_time * 1000000) / (double)iter /
+			(double)rte_get_tsc_hz(), (double)deq_min_time,
+			(double)(deq_min_time * 1000000) / rte_get_tsc_hz(),
+			(double)deq_max_time, (double)(deq_max_time * 1000000) /
+			rte_get_tsc_hz());
 
 	return TEST_SUCCESS;
 }
@@ -2067,9 +2244,9 @@  throughput_tc(void)
 }
 
 static int
-offload_latency_tc(void)
+offload_cost_tc(void)
 {
-	return run_test_case(offload_latency_test);
+	return run_test_case(offload_cost_test);
 }
 
 static int
@@ -2079,9 +2256,9 @@  offload_latency_empty_q_tc(void)
 }
 
 static int
-operation_latency_tc(void)
+latency_tc(void)
 {
-	return run_test_case(operation_latency_test);
+	return run_test_case(latency_test);
 }
 
 static int
@@ -2105,7 +2282,7 @@  static struct unit_test_suite bbdev_validation_testsuite = {
 	.setup = testsuite_setup,
 	.teardown = testsuite_teardown,
 	.unit_test_cases = {
-		TEST_CASE_ST(ut_setup, ut_teardown, operation_latency_tc),
+		TEST_CASE_ST(ut_setup, ut_teardown, latency_tc),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };
@@ -2115,9 +2292,18 @@  static struct unit_test_suite bbdev_latency_testsuite = {
 	.setup = testsuite_setup,
 	.teardown = testsuite_teardown,
 	.unit_test_cases = {
-		TEST_CASE_ST(ut_setup, ut_teardown, offload_latency_tc),
+		TEST_CASE_ST(ut_setup, ut_teardown, latency_tc),
+		TEST_CASES_END() /**< NULL terminate unit test array */
+	}
+};
+
+static struct unit_test_suite bbdev_offload_cost_testsuite = {
+	.suite_name = "BBdev Offload Cost Tests",
+	.setup = testsuite_setup,
+	.teardown = testsuite_teardown,
+	.unit_test_cases = {
+		TEST_CASE_ST(ut_setup, ut_teardown, offload_cost_tc),
 		TEST_CASE_ST(ut_setup, ut_teardown, offload_latency_empty_q_tc),
-		TEST_CASE_ST(ut_setup, ut_teardown, operation_latency_tc),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };
@@ -2135,4 +2321,5 @@  static struct unit_test_suite bbdev_interrupt_testsuite = {
 REGISTER_TEST_COMMAND(throughput, bbdev_throughput_testsuite);
 REGISTER_TEST_COMMAND(validation, bbdev_validation_testsuite);
 REGISTER_TEST_COMMAND(latency, bbdev_latency_testsuite);
+REGISTER_TEST_COMMAND(offload, bbdev_offload_cost_testsuite);
 REGISTER_TEST_COMMAND(interrupt, bbdev_interrupt_testsuite);
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 302abf5..70691f3 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -9,6 +9,7 @@ 
 #include <rte_malloc.h>
 #include <rte_ring.h>
 #include <rte_kvargs.h>
+#include <rte_cycles.h>
 
 #include <rte_bbdev.h>
 #include <rte_bbdev_pmd.h>
@@ -20,18 +21,6 @@ 
 
 #define DRIVER_NAME turbo_sw
 
-/* Turbo SW PMD logging ID */
-static int bbdev_turbo_sw_logtype;
-
-/* Helper macro for logging */
-#define rte_bbdev_log(level, fmt, ...) \
-	rte_log(RTE_LOG_ ## level, bbdev_turbo_sw_logtype, fmt "\n", \
-		##__VA_ARGS__)
-
-#define rte_bbdev_log_debug(fmt, ...) \
-	rte_bbdev_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
-		##__VA_ARGS__)
-
 /* Number of columns in sub-block interleaver (36.212, section 5.1.4.1.1) */
 #define C_SUBBLOCK (32)
 #define MAX_TB_SIZE (391656)
@@ -454,7 +443,8 @@  static inline void
 process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		uint8_t cb_idx, uint8_t c, uint16_t k, uint16_t ncb,
 		uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out,
-		uint16_t in_offset, uint16_t out_offset, uint16_t total_left)
+		uint16_t in_offset, uint16_t out_offset, uint16_t total_left,
+		struct rte_bbdev_stats *q_stats)
 {
 	int ret;
 	int16_t k_idx;
@@ -462,10 +452,16 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 	uint8_t *in, *out0, *out1, *out2, *tmp_out, *rm_out;
 	struct rte_bbdev_op_turbo_enc *enc = &op->turbo_enc;
 	struct bblib_crc_request crc_req;
+	struct bblib_crc_response crc_resp;
 	struct bblib_turbo_encoder_request turbo_req;
 	struct bblib_turbo_encoder_response turbo_resp;
 	struct bblib_rate_match_dl_request rm_req;
 	struct bblib_rate_match_dl_response rm_resp;
+#ifdef RTE_TEST_BBDEV
+	uint64_t start_time;
+#else
+	RTE_SET_USED(q_stats);
+#endif
 
 	k_idx = compute_idx(k);
 	in = rte_pktmbuf_mtod_offset(m_in, uint8_t *, in_offset);
@@ -482,13 +478,20 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		 * it by 3 CRC bytes
 		 */
 		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
-		crc_req.data = q->enc_in;
+		crc_req.data = in;
 		crc_req.len = (k - 24) >> 3;
-		if (bblib_lte_crc24a_gen(&crc_req) == -1) {
-			op->status |= 1 << RTE_BBDEV_CRC_ERROR;
-			rte_bbdev_log(ERR, "CRC24a generation failed");
-			return;
-		}
+		crc_resp.data = q->enc_in;
+
+#ifdef RTE_TEST_BBDEV
+		start_time = rte_rdtsc_precise();
+#endif
+
+		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
+
+#ifdef RTE_TEST_BBDEV
+		q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 		in = q->enc_in;
 	} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
 		/* CRC24B */
@@ -501,13 +504,20 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		 * it by 3 CRC bytes
 		 */
 		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
-		crc_req.data = q->enc_in;
+		crc_req.data = in;
 		crc_req.len = (k - 24) >> 3;
-		if (bblib_lte_crc24b_gen(&crc_req) == -1) {
-			op->status |= 1 << RTE_BBDEV_CRC_ERROR;
-			rte_bbdev_log(ERR, "CRC24b generation failed");
-			return;
-		}
+		crc_resp.data = q->enc_in;
+
+#ifdef RTE_TEST_BBDEV
+		start_time = rte_rdtsc_precise();
+#endif
+
+		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
+
+#ifdef RTE_TEST_BBDEV
+		q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 		in = q->enc_in;
 	} else {
 		ret = is_enc_input_valid(k, k_idx, total_left);
@@ -533,12 +543,21 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 	turbo_resp.output_win_0 = out0;
 	turbo_resp.output_win_1 = out1;
 	turbo_resp.output_win_2 = out2;
+
+#ifdef RTE_TEST_BBDEV
+	start_time = rte_rdtsc_precise();
+#endif
+
 	if (bblib_turbo_encoder(&turbo_req, &turbo_resp) != 0) {
 		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 		rte_bbdev_log(ERR, "Turbo Encoder failed");
 		return;
 	}
 
+#ifdef RTE_TEST_BBDEV
+	q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 	/* Rate-matching */
 	if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH) {
 		/* get output data starting address */
@@ -588,11 +607,20 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		else
 			rm_req.bypass_rvidx = 0;
 
+#ifdef RTE_TEST_BBDEV
+		start_time = rte_rdtsc_precise();
+#endif
+
 		if (bblib_rate_match_dl(&rm_req, &rm_resp) != 0) {
 			op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 			rte_bbdev_log(ERR, "Rate matching failed");
 			return;
 		}
+
+#ifdef RTE_TEST_BBDEV
+		q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 		enc->output.length += rm_resp.OutputLen;
 	} else {
 		/* Rate matching is bypassed */
@@ -637,7 +665,8 @@  process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 }
 
 static inline void
-enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
+enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
+		struct rte_bbdev_stats *queue_stats)
 {
 	uint8_t c, r, crc24_bits = 0;
 	uint16_t k, ncb;
@@ -692,7 +721,8 @@  enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
 		}
 
 		process_enc_cb(q, op, r, c, k, ncb, e, m_in,
-				m_out, in_offset, out_offset, total_left);
+				m_out, in_offset, out_offset, total_left,
+				queue_stats);
 		/* Update total_left */
 		total_left -= (k - crc24_bits) >> 3;
 		/* Update offsets for next CBs (if exist) */
@@ -714,12 +744,15 @@  enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
 
 static inline uint16_t
 enqueue_enc_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_enc_op **ops,
-		uint16_t nb_ops)
+		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
 {
 	uint16_t i;
+#ifdef RTE_TEST_BBDEV
+	queue_stats->turbo_perf_time = 0;
+#endif
 
 	for (i = 0; i < nb_ops; ++i)
-		enqueue_enc_one_op(q, ops[i]);
+		enqueue_enc_one_op(q, ops[i], queue_stats);
 
 	return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops, nb_ops,
 			NULL);
@@ -898,6 +931,8 @@  process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
 	turbo_req.k = k;
 	turbo_req.k_idx = k_idx;
 	turbo_req.max_iter_num = dec->iter_max;
+	turbo_req.early_term_disable = !check_bit(dec->op_flags,
+			RTE_BBDEV_TURBO_EARLY_TERMINATION);
 	turbo_resp.ag_buf = q->ag;
 	turbo_resp.cb_buf = q->code_block;
 	turbo_resp.output = out;
@@ -1004,7 +1039,7 @@  enqueue_enc_ops(struct rte_bbdev_queue_data *q_data,
 	struct turbo_sw_queue *q = queue;
 	uint16_t nb_enqueued = 0;
 
-	nb_enqueued = enqueue_enc_all_ops(q, ops, nb_ops);
+	nb_enqueued = enqueue_enc_all_ops(q, ops, nb_ops, &q_data->queue_stats);
 
 	q_data->queue_stats.enqueue_err_count += nb_ops - nb_enqueued;
 	q_data->queue_stats.enqueued_count += nb_enqueued;
@@ -1207,11 +1242,12 @@  RTE_PMD_REGISTER_PARAM_STRING(DRIVER_NAME,
 	TURBO_SW_MAX_NB_QUEUES_ARG"=<int> "
 	TURBO_SW_SOCKET_ID_ARG"=<int>");
 
+int bbdev_logtype;
 RTE_INIT(null_bbdev_init_log);
 static void
 null_bbdev_init_log(void)
 {
-	bbdev_turbo_sw_logtype = rte_log_register("pmd.bb.turbo_sw");
-	if (bbdev_turbo_sw_logtype >= 0)
-		rte_log_set_level(bbdev_turbo_sw_logtype, RTE_LOG_NOTICE);
+	bbdev_logtype = rte_log_register("pmd.bbdev.turbo_sw");
+	if (bbdev_logtype >= 0)
+		rte_log_set_level(bbdev_logtype, RTE_LOG_NOTICE);
 }
diff --git a/lib/librte_bbdev/rte_bbdev.h b/lib/librte_bbdev/rte_bbdev.h
index 5e7e495..395acf6 100644
--- a/lib/librte_bbdev/rte_bbdev.h
+++ b/lib/librte_bbdev/rte_bbdev.h
@@ -239,6 +239,10 @@  struct rte_bbdev_stats {
 	uint64_t enqueue_err_count;
 	/** Total error count on operations dequeued */
 	uint64_t dequeue_err_count;
+#ifdef RTE_TEST_BBDEV
+	/** It stores turbo decoder/encoder working time. */
+	uint64_t turbo_perf_time;
+#endif
 };
 
 /**