[dpdk-dev] [PATCH v2 3/4] app/test: add performance autotest for rte_memset

Zhiyong Yang zhiyong.yang at intel.com
Tue Dec 27 11:04:57 CET 2016


This file implements the performance autotest for rte_memset. Running it
prints performance data comparing rte_memset against memset.
The results show that rte_memset is clearly faster than glibc memset,
especially for small sizes of N bytes.
The first column shows the size N in bytes passed to memset and rte_memset.
The second column lists the ticks per call as "rte_memset - memset" when
the destination is in cache.
The third column lists the ticks per call as "rte_memset - memset" when
the destination is in memory.

The following data was collected on Haswell.

** rte_memset() - memset perf tests
        (C = compile-time constant) **
======== ======= ======== ======= ========
   Size memset in cache  memset in mem
(bytes)        (ticks)        (ticks)
------- -------------- ---------------
============= 32B aligned ================
      1       3 -    8      14 -  115
      3       4 -    8      19 -  125
      6       3 -    7      19 -  125
      8       3 -    6      19 -  124
     12       3 -    6      19 -  124
     15       3 -    6      19 -  125
     16       3 -    8      13 -  125
     32       3 -    7      19 -  133
     64       3 -    7      28 -  162
     65       6 -    8      41 -  182
    128       6 -   13      54 -  199
    192       8 -   13      77 -  273
    255       8 -   16     100 -  222
    512      17 -   14     187 -  247
    768      22 -   20     270 -  362
   1024      29 -   28     329 -  377
   2048      63 -   57     564 -  601
   4096     104 -  102     993 - 1025
   8192     200 -  211    1831 - 2270
------ -------------- -------------- ------
C     6       2 -    2      19 -   19
C    64       2 -    6      28 -   33
C   128       3 -   12      54 -   59
C   192       5 -   29      77 -   83
C   256       6 -   35     100 -  105
C   512      12 -   60     188 -  195
C   768      18 -   20     271 -  362
C  1024      24 -   29     329 -  377

Signed-off-by: Zhiyong Yang <zhiyong.yang at intel.com>
---

Change in V2:

Add perf comparison data between rte_memset and memset on Haswell.

 app/test/Makefile           |   1 +
 app/test/test_memset_perf.c | 348 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 349 insertions(+)
 create mode 100644 app/test/test_memset_perf.c

diff --git a/app/test/Makefile b/app/test/Makefile
index 82da3f3..1c3e7f1 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -124,6 +124,7 @@ SRCS-y += test_memcpy.c
 SRCS-y += test_memcpy_perf.c
 
 SRCS-y += test_memset.c
+SRCS-y += test_memset_perf.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_thash.c
diff --git a/app/test/test_memset_perf.c b/app/test/test_memset_perf.c
new file mode 100644
index 0000000..83b15b5
--- /dev/null
+++ b/app/test/test_memset_perf.c
@@ -0,0 +1,348 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_malloc.h>
+#include <rte_memset.h>
+#include "test.h"
+
+/*
+ * Set this to the maximum buffer size you want to test. If it is 0, then the
+ * values in the buf_sizes[] array below will be used.
+ */
+#define TEST_VALUE_RANGE        0
+
+/* List of buffer sizes to test */
+#if TEST_VALUE_RANGE == 0
+static size_t buf_sizes[] = {
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65,
+	70, 85, 96, 105, 115, 127, 128, 129, 161, 191, 192, 193, 255, 256,
+	257, 319, 320, 321, 383, 384, 385, 447, 448, 449, 511, 512, 513,
+	767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600, 2048, 2560,
+	3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
+};
+/* MUST be as large as the largest buffer size above */
+#define SMALL_BUFFER_SIZE 8192
+#else /* TEST_VALUE_RANGE != 0 */
+static size_t buf_sizes[TEST_VALUE_RANGE];
+#define SMALL_BUFFER_SIZE       TEST_VALUE_RANGE
+#endif /* TEST_VALUE_RANGE == 0 */
+
+/*
+ * Arrays of this size are used for measuring uncached memory accesses by
+ * picking a random location within the buffer. Make this smaller if there are
+ * memory allocation errors.
+ */
+#define LARGE_BUFFER_SIZE       (100 * 1024 * 1024)
+
+/* How many times to run timing loop for performance tests */
+#define TEST_ITERATIONS         1000000
+#define TEST_BATCH_SIZE         100
+
+/* Data is aligned on this many bytes (power of 2) */
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+#define ALIGNMENT_UNIT          64
+#elif defined RTE_MACHINE_CPUFLAG_AVX2
+#define ALIGNMENT_UNIT          32
+#else /* RTE_MACHINE_CPUFLAG */
+#define ALIGNMENT_UNIT          16
+#endif /* RTE_MACHINE_CPUFLAG */
+
+/*
+ * Pointers used in performance tests. The two large buffers are for uncached
+ * access where random addresses within the buffer are used for each
+ * memset. The two small buffers are for cached access.
+ */
+static uint8_t *large_buf_read, *large_buf_write;
+static uint8_t *small_buf_read, *small_buf_write;
+
+/* Allocate all four buffers, randomise the read pair; 0 if OK, else -1. */
+static int
+init_buffers(void)
+{
+	unsigned int i;
+
+	large_buf_read = rte_malloc("memset", LARGE_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (large_buf_read == NULL)
+		goto error_large_buf_read;
+
+	large_buf_write = rte_malloc("memset", LARGE_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (large_buf_write == NULL)
+		goto error_large_buf_write;
+
+	small_buf_read = rte_malloc("memset", SMALL_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (small_buf_read == NULL)
+		goto error_small_buf_read;
+
+	small_buf_write = rte_malloc("memset", SMALL_BUFFER_SIZE
+					+ ALIGNMENT_UNIT, ALIGNMENT_UNIT);
+	if (small_buf_write == NULL)
+		goto error_small_buf_write;
+	/* Fill both read buffers with values drawn from rte_rand(). */
+	for (i = 0; i < LARGE_BUFFER_SIZE; i++)
+		large_buf_read[i] = rte_rand();
+	for (i = 0; i < SMALL_BUFFER_SIZE; i++)
+		small_buf_read[i] = rte_rand();
+
+	return 0;
+	/* Failure: each label frees what was allocated before the failing call. */
+error_small_buf_write:
+	rte_free(small_buf_read);
+error_small_buf_read:
+	rte_free(large_buf_write);
+error_large_buf_write:
+	rte_free(large_buf_read);
+error_large_buf_read:
+	printf("ERROR: not enough memory\n");
+	return -1;
+}
+
+/* Release the four buffers set up by init_buffers() with rte_free(). */
+static void
+free_buffers(void)
+{
+	rte_free(large_buf_read);
+	rte_free(large_buf_write);
+	rte_free(small_buf_read);
+	rte_free(small_buf_write);
+}
+
+/*
+ * Return a random ALIGNMENT_UNIT-aligned offset below (LARGE_BUFFER_SIZE -
+ * SMALL_BUFFER_SIZE), plus uoffset (the unaligned tests pass uoffset = 1).
+ */
+static inline size_t
+get_rand_offset(size_t uoffset)
+{
+	return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
+			~(ALIGNMENT_UNIT - 1)) + uoffset;
+}
+
+/* Fill dst_addr[0..TEST_BATCH_SIZE): dst_uoffset if cached, else random. */
+static inline void
+fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset)
+{
+	unsigned int i;
+	/* Cached runs reuse one offset; uncached runs draw a new one per slot. */
+	for (i = 0; i < TEST_BATCH_SIZE; i++)
+		dst_addr[i] = (is_dst_cached) ? dst_uoffset :
+					get_rand_offset(dst_uoffset);
+}
+
+/*
+ * WORKAROUND: For some reason the first test doing an uncached write
+ * takes a very long time (~25 times longer than is expected). So we first
+ * run TEST_ITERATIONS untimed rte_memset() calls of "size" bytes on dst.
+ */
+static void
+do_uncached_write(uint8_t *dst, int is_dst_cached, size_t size)
+{
+	unsigned int i, j;
+	size_t dst_addrs[TEST_BATCH_SIZE];
+	int ch = rte_rand() & 0xff;	/* random fill byte */
+
+	for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
+		fill_addr_arrays(dst_addrs, is_dst_cached, 0); /* uoffset 0 */
+		for (j = 0; j < TEST_BATCH_SIZE; j++)
+			rte_memset(dst+dst_addrs[j], ch, size);
+	}
+}
+
+/*
+ * Time rte_memset() and then memset() over TEST_ITERATIONS calls each and
+ * print the mean ticks per call. A macro so a constant "size" stays constant.
+ */
+#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, size)             \
+do {                                                                        \
+	unsigned int iter, t;                                               \
+	size_t dst_addrs[TEST_BATCH_SIZE];                                  \
+	uint64_t start_time, total_time = 0;                                \
+	uint64_t total_time2 = 0;                                           \
+	int ch = rte_rand() & 0xff;                                         \
+									    \
+	for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) {\
+	fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset);            \
+	start_time = rte_rdtsc();                                           \
+	for (t = 0; t < TEST_BATCH_SIZE; t++)                               \
+		rte_memset(dst+dst_addrs[t], ch, size);                      \
+	total_time += rte_rdtsc() - start_time;                             \
+	}                                                                   \
+	for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) {\
+	fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset);            \
+	start_time = rte_rdtsc();                                           \
+	for (t = 0; t < TEST_BATCH_SIZE; t++)                               \
+		memset(dst+dst_addrs[t], ch, size);                         \
+	total_time2 += rte_rdtsc() - start_time;                            \
+	}                                                                   \
+	printf("%8.0f -",  (double)total_time / TEST_ITERATIONS);           \
+	printf("%5.0f",  (double)total_time2 / TEST_ITERATIONS);            \
+} while (0)
+
+/* Print n ("C" if constant), run cached and uncached tests at offset 0. */
+#define ALL_PERF_TESTS_FOR_SIZE(n)                                       \
+do {                                                                     \
+	if (__builtin_constant_p(n))                                     \
+		printf("\nC%6u", (unsigned int)n);                       \
+	else                                                             \
+		printf("\n%7u", (unsigned int)n);                        \
+	SINGLE_PERF_TEST(small_buf_write, 1, 0, n);                      \
+	SINGLE_PERF_TEST(large_buf_write, 0, 0, n);                      \
+} while (0)
+
+/* Print n ("C" if constant), run cached and uncached tests at offset 1. */
+#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n)                             \
+do {                                                                     \
+	if (__builtin_constant_p(n))                                     \
+		printf("\nC%6u", (unsigned int)n);                       \
+	else                                                             \
+		printf("\n%7u", (unsigned int)n);                        \
+	SINGLE_PERF_TEST(small_buf_write, 1, 1, n);                      \
+	SINGLE_PERF_TEST(large_buf_write, 0, 1, n);                      \
+} while (0)
+
+/* Invoke TEST_CONSTANT (defined at the point of use) per constant length. */
+#define ALL_PERF_TEST_FOR_CONSTANT                                       \
+do {                                                                     \
+	TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U);      \
+	TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U);   \
+	TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
+} while (0)
+
+/* Run the constant-size list through ALL_PERF_TESTS_FOR_SIZE (aligned). */
+static inline void
+perf_test_constant_aligned(void)
+{
+#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
+	ALL_PERF_TEST_FOR_CONSTANT;
+#undef TEST_CONSTANT
+}
+
+/* Run the constant-size list through ALL_PERF_TESTS_FOR_SIZE_UNALIGNED. */
+static inline void
+perf_test_constant_unaligned(void)
+{
+#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
+	ALL_PERF_TEST_FOR_CONSTANT;
+#undef TEST_CONSTANT
+}
+
+/* Run the aligned tests once for every entry of buf_sizes[]. */
+static inline void
+perf_test_variable_aligned(void)
+{
+	unsigned int n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
+	unsigned int i;
+	/* n is the element count of buf_sizes[]. */
+	for (i = 0; i < n; i++)
+		ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
+}
+
+/* Run the unaligned tests once for every entry of buf_sizes[]. */
+static inline void
+perf_test_variable_unaligned(void)
+{
+	unsigned int n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
+	unsigned int i;
+	/* n is the element count of buf_sizes[]. */
+	for (i = 0; i < n; i++)
+		ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
+}
+
+/* Set up buffers, print the result table for each test group, then free. */
+static int
+perf_test(void)
+{
+	int ret;
+
+	ret = init_buffers();
+	if (ret != 0)
+		return ret;	/* init_buffers() failed; nothing to free */
+
+#if TEST_VALUE_RANGE != 0
+	/* Set up buf_sizes array, if required */
+	unsigned int i;
+
+	for (i = 0; i < TEST_VALUE_RANGE; i++)
+		buf_sizes[i] = i;
+#endif
+
+	/* Untimed warm-up pass; see the comment on do_uncached_write() */
+	do_uncached_write(large_buf_write, 0, SMALL_BUFFER_SIZE);
+
+	printf("\n** rte_memset() - memset perf tests \t\n  \
+	(C = compile-time constant) **\n"
+		"======== ======= ======== ======= ========\n"
+		"   Size memset in cache  memset in mem\n"
+		"(bytes)        (ticks)        (ticks)\n"
+		"------- -------------- ---------------");
+
+	printf("\n============= %2dB aligned ================", ALIGNMENT_UNIT);
+	/* Do aligned tests where size is a variable */
+	perf_test_variable_aligned();
+	printf("\n------ -------------- -------------- ------");
+	/* Do aligned tests where size is a compile-time constant */
+	perf_test_constant_aligned();
+	printf("\n============= Unaligned ===================");
+	/* Do unaligned tests where size is a variable */
+	perf_test_variable_unaligned();
+	printf("\n------ -------------- -------------- ------");
+	/* Do unaligned tests where size is a compile-time constant */
+	perf_test_constant_unaligned();
+	printf("\n====== ============== ============== =======\n\n");
+
+	free_buffers();
+
+	return 0;
+}
+
+static int
+test_memset_perf(void)
+{
+	int ret;
+	/* Map any non-zero perf_test() result to -1; success stays 0. */
+	ret = perf_test();
+	if (ret != 0)
+		return -1;
+	return 0;
+}
+/* Presumably exposes the test as "memset_perf_autotest"; confirm in test.h. */
+REGISTER_TEST_COMMAND(memset_perf_autotest, test_memset_perf);
-- 
2.7.4



More information about the dev mailing list