[dpdk-dev,v4,2/3] eal: add u64 bit variant for reciprocal

Message ID 1504608532-18598-2-git-send-email-pbhagavatula@caviumnetworks.com (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Pavan Nikhilesh Sept. 5, 2017, 10:48 a.m. UTC
  From: Pavan Bhagavatula <pbhagavatula@caviumnetworks.com>

Currently, rte_reciprocal only supports unsigned 32bit divisors. This
commit adds support for unsigned 64bit divisors.

Rename unsigned 32bit specific functions appropriately and update
librte_sched accordingly.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@caviumnetworks.com>
---
 lib/librte_eal/bsdapp/eal/rte_eal_version.map   |   3 +-
 lib/librte_eal/common/include/rte_reciprocal.h  | 111 +++++++++++++++++++++--
 lib/librte_eal/common/rte_reciprocal.c          | 116 +++++++++++++++++++++---
 lib/librte_eal/linuxapp/eal/rte_eal_version.map |   3 +-
 lib/librte_sched/Makefile                       |   4 +-
 lib/librte_sched/rte_sched.c                    |   9 +-
 6 files changed, 220 insertions(+), 26 deletions(-)
  

Comments

Stephen Hemminger Sept. 5, 2017, 5:29 p.m. UTC | #1
On Tue,  5 Sep 2017 16:18:51 +0530
Pavan Nikhilesh <pbhagavatula@caviumnetworks.com> wrote:

> +/**
> + * Unsigned 32-bit divisor structure.
> + */
> +struct rte_reciprocal_u32 {
>  	uint32_t m;
>  	uint8_t sh1, sh2;
> -};
> +} __rte_cache_aligned;
> +
> +/**
> + * Unsigned 64-bit divisor structure.
> + */
> +struct rte_reciprocal_u64 {
> +	uint64_t m;
> +	uint8_t sh1;
> +} __rte_cache_aligned;

I understand you want to squeeze every cycle out but it is not
required that each of these structures always be cache aligned.

They maybe embedded in other structures and having the structure
padded so that these elements are cache aligned would take up
more space and make cache performance worse.

Better off to not put attributes on the structure definitions, and instead
let usages of this feature align where appropriate.
  
Pavan Nikhilesh Sept. 6, 2017, 4:32 a.m. UTC | #2
On Tue, Sep 05, 2017 at 10:29:01AM -0700, Stephen Hemminger wrote:
> On Tue,  5 Sep 2017 16:18:51 +0530
> Pavan Nikhilesh <pbhagavatula@caviumnetworks.com> wrote:
>
> > +/**
> > + * Unsigned 32-bit divisor structure.
> > + */
> > +struct rte_reciprocal_u32 {
> >  	uint32_t m;
> >  	uint8_t sh1, sh2;
> > -};
> > +} __rte_cache_aligned;
> > +
> > +/**
> > + * Unsigned 64-bit divisor structure.
> > + */
> > +struct rte_reciprocal_u64 {
> > +	uint64_t m;
> > +	uint8_t sh1;
> > +} __rte_cache_aligned;
>
> I understand you want to squeeze every cycle out but it is not
> required that each of these structures always be cache aligned.
>
> They maybe embedded in other structures and having the structure
> padded so that these elements are cache aligned would take up
> more space and make cache performance worse.
>
> Better off to not put attributes on the structure definitions, and instead
> let usages of this feature align where appropriate.
>

Agreed, will remove cache alignment in the next version (v6).

Thanks,
Pavan.
  

Patch

diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 90d7258..59a85bb 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -241,6 +241,7 @@  EXPERIMENTAL {
 DPDK_17.11 {
 	global:
 
-	rte_reciprocal_value;
+	rte_reciprocal_value_u32;
+	rte_reciprocal_value_u64;
 
 } DPDK_17.08;
diff --git a/lib/librte_eal/common/include/rte_reciprocal.h b/lib/librte_eal/common/include/rte_reciprocal.h
index b6d752f..801d1c8 100644
--- a/lib/librte_eal/common/include/rte_reciprocal.h
+++ b/lib/librte_eal/common/include/rte_reciprocal.h
@@ -22,22 +22,117 @@ 
 #ifndef _RTE_RECIPROCAL_H_
 #define _RTE_RECIPROCAL_H_
 
-#include <stdint.h>
+#include <rte_memory.h>
 
-struct rte_reciprocal {
+/**
+ * Unsigned 32-bit divisor structure.
+ */
+struct rte_reciprocal_u32 {
 	uint32_t m;
 	uint8_t sh1, sh2;
-};
+} __rte_cache_aligned;
+
+/**
+ * Unsigned 64-bit divisor structure.
+ */
+struct rte_reciprocal_u64 {
+	uint64_t m;
+	uint8_t sh1;
+} __rte_cache_aligned;
 
+/**
+ * Divide given unsigned 32-bit integer with pre calculated divisor.
+ *
+ * @param a
+ *   The 32-bit dividend.
+ * @param R
+ *   The pointer to pre calculated divisor reciprocal structure.
+ *
+ * @return
+ *   The result of the division
+ */
 static inline uint32_t
-rte_reciprocal_divide(uint32_t a, struct rte_reciprocal R)
+rte_reciprocal_divide_u32(uint32_t a, struct rte_reciprocal_u32 *R)
+{
+	uint32_t t = (((uint64_t)a * R->m) >> 32);
+
+	return (t + ((a - t) >> R->sh1)) >> R->sh2;
+}
+
+static inline uint64_t
+mullhi_u64(uint64_t x, uint64_t y)
+{
+#ifdef __SIZEOF_INT128__
+	__uint128_t xl = x;
+	__uint128_t rl = xl * y;
+
+	return (rl >> 64);
+#else
+	uint64_t u0, u1, v0, v1, k, t;
+	uint64_t w1, w2;
+	uint64_t whi;
+
+	u1 = x >> 32; u0 = x & 0xFFFFFFFF;
+	v1 = y >> 32; v0 = y & 0xFFFFFFFF;
+
+	t = u0*v0;
+	k = t >> 32;
+
+	t = u1*v0 + k;
+	w1 = t & 0xFFFFFFFF;
+	w2 = t >> 32;
+
+	t = u0*v1 + w1;
+	k = t >> 32;
+
+	whi = u1*v1 + w2 + k;
+
+	return whi;
+#endif
+}
+
+/**
+ * Divide given unsigned 64-bit integer with pre calculated divisor.
+ *
+ * @param a
+ *   The 64-bit dividend.
+ * @param R
+ *   The pointer to pre calculated divisor reciprocal structure.
+ *
+ * @return
+ *   The result of the division
+ */
+static inline uint64_t
+rte_reciprocal_divide_u64(uint64_t a, struct rte_reciprocal_u64 *R)
 {
-	uint32_t t = (uint32_t)(((uint64_t)a * R.m) >> 32);
+	uint64_t q = mullhi_u64(R->m, a);
+	uint64_t t = ((a - q) >> 1) + q;
 
-	return (t + ((a - t) >> R.sh1)) >> R.sh2;
+	return t >> R->sh1;
 }
 
-struct rte_reciprocal
-rte_reciprocal_value(uint32_t d);
+/**
+ * Generate pre calculated divisor structure.
+ *
+ * @param d
+ *   The unsigned 32-bit divisor.
+ *
+ * @return
+ *   Divisor structure.
+ */
+struct rte_reciprocal_u32
+rte_reciprocal_value_u32(uint32_t d);
+
+/**
+ * Generate pre calculated divisor structure.
+ *
+ * @param d
+ *   The unsigned 64-bit divisor.
+ *
+ * @return
+ *   Divisor structure.
+ */
+struct rte_reciprocal_u64
+rte_reciprocal_value_u64(uint64_t d);
 
 #endif /* _RTE_RECIPROCAL_H_ */
diff --git a/lib/librte_eal/common/rte_reciprocal.c b/lib/librte_eal/common/rte_reciprocal.c
index 7ab99b4..2024e62 100644
--- a/lib/librte_eal/common/rte_reciprocal.c
+++ b/lib/librte_eal/common/rte_reciprocal.c
@@ -31,18 +31,13 @@ 
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <stdio.h>
-#include <stdint.h>
-
-#include <rte_common.h>
-
-#include "rte_reciprocal.h"
+#include <rte_reciprocal.h>
 
 /* find largest set bit.
  * portable and slow but does not matter for this usage.
  */
 static inline int
-fls(uint32_t x)
+fls_u32(uint32_t x)
 {
 	int b;
 
@@ -54,14 +49,14 @@  fls(uint32_t x)
 	return 0;
 }
 
-struct rte_reciprocal
-rte_reciprocal_value(uint32_t d)
+struct rte_reciprocal_u32
+rte_reciprocal_value_u32(uint32_t d)
 {
-	struct rte_reciprocal R;
+	struct rte_reciprocal_u32 R;
 	uint64_t m;
 	int l;
 
-	l = fls(d - 1);
+	l = fls_u32(d - 1);
 	m = ((1ULL << 32) * ((1ULL << l) - d));
 	m /= d;
 
@@ -72,3 +67,102 @@  rte_reciprocal_value(uint32_t d)
 
 	return R;
 }
+
+/* Code taken from Hacker's Delight:
+ * http://www.hackersdelight.org/HDcode/divlu.c.
+ * License permits inclusion here per:
+ * http://www.hackersdelight.org/permissions.htm
+ */
+static inline uint64_t
+divide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r)
+{
+	const uint64_t b = (1ULL << 32); /* Number base (16 bits). */
+	uint64_t un1, un0,           /* Norm. dividend LSD's. */
+			 vn1, vn0,           /* Norm. divisor digits. */
+			 q1, q0,             /* Quotient digits. */
+			 un64, un21, un10,   /* Dividend digit pairs. */
+			 rhat;               /* A remainder. */
+	int s;                       /* Shift amount for norm. */
+
+    /* If overflow, set rem. to an impossible value. */
+	if (u1 >= v) {
+		if (r != NULL)
+			*r = (uint64_t) -1;
+		return (uint64_t) -1;
+	}
+
+	/* Count leading zeros. */
+	s = __builtin_clzll(v);
+	if (s > 0) {
+		v = v << s;
+		un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
+		un10 = u0 << s;
+	} else {
+
+		un64 = u1 | u0;
+		un10 = u0;
+	}
+
+	vn1 = v >> 32;
+	vn0 = v & 0xFFFFFFFF;
+
+	un1 = un10 >> 32;
+	un0 = un10 & 0xFFFFFFFF;
+
+	q1 = un64/vn1;
+	rhat = un64 - q1*vn1;
+again1:
+	if (q1 >= b || q1*vn0 > b*rhat + un1) {
+		q1 = q1 - 1;
+		rhat = rhat + vn1;
+		if (rhat < b)
+			goto again1;
+	}
+
+	un21 = un64*b + un1 - q1*v;
+
+	q0 = un21/vn1;
+	rhat = un21 - q0*vn1;
+again2:
+	if (q0 >= b || q0*vn0 > b*rhat + un0) {
+		q0 = q0 - 1;
+		rhat = rhat + vn1;
+		if (rhat < b)
+			goto again2;
+	}
+
+	if (r != NULL)
+		*r = (un21*b + un0 - q0*v) >> s;
+	return q1*b + q0;
+}
+
+struct rte_reciprocal_u64
+rte_reciprocal_value_u64(uint64_t d)
+{
+	struct rte_reciprocal_u64 R;
+
+	const uint32_t fld = 63 - __builtin_clzll(d);
+
+	if ((d & (d - 1)) == 0) {
+		R.m = 0;
+		R.sh1 = (fld - 1) | 0x40;
+	} else {
+		uint64_t rem;
+		uint64_t multiplier;
+		uint8_t more;
+
+		multiplier = divide_128_div_64_to_64(1ULL << fld, 0, d, &rem);
+		multiplier += multiplier;
+
+		const uint64_t twice_rem = rem + rem;
+		if (twice_rem >= d || twice_rem < rem)
+			multiplier += 1;
+		more = fld;
+		R.m = 1 + multiplier;
+		R.sh1 = more | 0x40;
+	}
+
+	R.sh1 &= 0x3F;
+
+	return R;
+}
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 2070cba..2671627 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -246,6 +246,7 @@  EXPERIMENTAL {
 DPDK_17.11 {
 	global:
 
-	rte_reciprocal_value;
+	rte_reciprocal_value_u32;
+	rte_reciprocal_value_u64;
 
 } DPDK_17.08;
diff --git a/lib/librte_sched/Makefile b/lib/librte_sched/Makefile
index 569656b..a2fd6f3 100644
--- a/lib/librte_sched/Makefile
+++ b/lib/librte_sched/Makefile
@@ -54,6 +54,8 @@  LIBABIVER := 1
 SRCS-$(CONFIG_RTE_LIBRTE_SCHED) += rte_sched.c rte_red.c rte_approx.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include := rte_sched.h rte_bitmap.h rte_sched_common.h rte_red.h rte_approx.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include := rte_sched.h rte_bitmap.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include += rte_sched_common.h rte_red.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_SCHED)-include += rte_approx.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 3b8ccaa..7bb6d51 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -228,7 +228,7 @@  struct rte_sched_port {
 	uint64_t time_cpu_cycles;     /* Current CPU time measured in CPU cyles */
 	uint64_t time_cpu_bytes;      /* Current CPU time measured in bytes */
 	uint64_t time;                /* Current NIC TX time measured in bytes */
-	struct rte_reciprocal inv_cycles_per_byte; /* CPU cycles per byte */
+	struct rte_reciprocal_u32 inv_cycles_per_byte; /* CPU cycles per byte */
 
 	/* Scheduling loop detection */
 	uint32_t pipe_loop;
@@ -677,7 +677,7 @@  rte_sched_port_config(struct rte_sched_port_params *params)
 
 	cycles_per_byte = (rte_get_tsc_hz() << RTE_SCHED_TIME_SHIFT)
 		/ params->rate;
-	port->inv_cycles_per_byte = rte_reciprocal_value(cycles_per_byte);
+	port->inv_cycles_per_byte = rte_reciprocal_value_u32(cycles_per_byte);
 
 	/* Scheduling loop detection */
 	port->pipe_loop = RTE_SCHED_PIPE_INVALID;
@@ -2147,8 +2147,9 @@  rte_sched_port_time_resync(struct rte_sched_port *port)
 	uint64_t bytes_diff;
 
 	/* Compute elapsed time in bytes */
-	bytes_diff = rte_reciprocal_divide(cycles_diff << RTE_SCHED_TIME_SHIFT,
-					   port->inv_cycles_per_byte);
+	bytes_diff = rte_reciprocal_divide_u32(
+			cycles_diff << RTE_SCHED_TIME_SHIFT,
+			&port->inv_cycles_per_byte);
 
 	/* Advance port time */
 	port->time_cpu_cycles = cycles;