[dpdk-dev,v2] rte_hash: add scalable multi-writer insertion w/ Intel TSX

Message ID 1466052753-69632-1-git-send-email-wei1.shen@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Commit Message

Shen Wei June 16, 2016, 4:52 a.m. UTC
  This patch introduced scalable multi-writer Cuckoo Hash insertion
based on a split Cuckoo Search and Move operation using Intel
TSX. It can do scalable hash insertion with 22 cores with little
performance loss and negligible TSX abortion rate.

* Added an extra rte_hash flag definition to switch default
  single writer Cuckoo Hash behavior to multiwriter.

* Added a make_space_insert_bfs_mw() function to do split Cuckoo
  search in BFS order.

* Added tsx_cuckoo_move_insert() to do Cuckoo move in Intel TSX
  protected manner.

* Added test_hash_multiwriter() as test case for multi-writer
  Cuckoo Hash.

Signed-off-by: Shen Wei <wei1.shen@intel.com>
Signed-off-by: Sameh Gobriel <sameh.gobriel@intel.com>
---
 app/test/Makefile                      |   1 +
 app/test/test_hash_multiwriter.c       | 272 +++++++++++++++++++++++++++++++++
 doc/guides/rel_notes/release_16_07.rst |  12 ++
 lib/librte_hash/rte_cuckoo_hash.c      | 231 +++++++++++++++++++++++++---
 lib/librte_hash/rte_hash.h             |   3 +
 5 files changed, 494 insertions(+), 25 deletions(-)
 create mode 100644 app/test/test_hash_multiwriter.c
  

Comments

Ananyev, Konstantin June 16, 2016, 12:14 p.m. UTC | #1
Hi Wei,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Wei Shen
> Sent: Thursday, June 16, 2016 5:53 AM
> To: dev@dpdk.org
> Cc: De Lara Guarch, Pablo; stephen@networkplumber.org; Tai, Charlie; Maciocco, Christian; Gobriel, Sameh; Shen, Wei1
> Subject: [dpdk-dev] [PATCH v2] rte_hash: add scalable multi-writer insertion w/ Intel TSX
> 
> This patch introduced scalable multi-writer Cuckoo Hash insertion
> based on a split Cuckoo Search and Move operation using Intel
> TSX. It can do scalable hash insertion with 22 cores with little
> performance loss and negligible TSX abortion rate.
> 
> * Added an extra rte_hash flag definition to switch default
>   single writer Cuckoo Hash behavior to multiwriter.
> 
> * Added a make_space_insert_bfs_mw() function to do split Cuckoo
>   search in BFS order.
> 
> * Added tsx_cuckoo_move_insert() to do Cuckoo move in Intel TSX
>   protected manner.
> 
> * Added test_hash_multiwriter() as test case for multi-writer
>   Cuckoo Hash.
> 
> Signed-off-by: Shen Wei <wei1.shen@intel.com>
> Signed-off-by: Sameh Gobriel <sameh.gobriel@intel.com>
> ---
>  app/test/Makefile                      |   1 +
>  app/test/test_hash_multiwriter.c       | 272 +++++++++++++++++++++++++++++++++
>  doc/guides/rel_notes/release_16_07.rst |  12 ++
>  lib/librte_hash/rte_cuckoo_hash.c      | 231 +++++++++++++++++++++++++---
>  lib/librte_hash/rte_hash.h             |   3 +
>  5 files changed, 494 insertions(+), 25 deletions(-)
>  create mode 100644 app/test/test_hash_multiwriter.c
> 
> diff --git a/app/test/Makefile b/app/test/Makefile
> index 053f3a2..5476300 100644
> --- a/app/test/Makefile
> +++ b/app/test/Makefile
> @@ -120,6 +120,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_thash.c
>  SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_perf.c
>  SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_functions.c
>  SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_scaling.c
> +SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_multiwriter.c
> 
>  SRCS-$(CONFIG_RTE_LIBRTE_LPM) += test_lpm.c
>  SRCS-$(CONFIG_RTE_LIBRTE_LPM) += test_lpm_perf.c
> diff --git a/app/test/test_hash_multiwriter.c b/app/test/test_hash_multiwriter.c
> new file mode 100644
> index 0000000..54a0d2c
> --- /dev/null
> +++ b/app/test/test_hash_multiwriter.c
> @@ -0,0 +1,272 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *	 notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *	 notice, this list of conditions and the following disclaimer in
> + *	 the documentation and/or other materials provided with the
> + *	 distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *	 contributors may be used to endorse or promote products derived
> + *	 from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +#include <inttypes.h>
> +#include <locale.h>
> +
> +#include <rte_cycles.h>
> +#include <rte_hash.h>
> +#include <rte_hash_crc.h>
> +#include <rte_launch.h>
> +#include <rte_malloc.h>
> +#include <rte_random.h>
> +#include <rte_spinlock.h>
> +
> +#include "test.h"
> +
> +/*
> + * Check condition and return an error if true. Assumes that "handle" is the
> + * name of the hash structure pointer to be freed.
> + */
> +#define RETURN_IF_ERROR(cond, str, ...) do {                            \
> +	if (cond) {                                                     \
> +		printf("ERROR line %d: " str "\n", __LINE__,            \
> +							##__VA_ARGS__);	\
> +		if (handle)                                             \
> +			rte_hash_free(handle);                          \
> +		return -1;                                              \
> +	}                                                               \
> +} while (0)
> +
> +#define RTE_APP_TEST_HASH_MULTIWRITER_FAILED 0
> +
> +struct {
> +	uint32_t *keys;
> +	uint32_t *found;
> +	uint32_t nb_tsx_insertion;
> +	struct rte_hash *h;
> +} tbl_multiwriter_test_params;
> +
> +const uint32_t nb_entries = 16*1024*1024;
> +const uint32_t nb_total_tsx_insertion = 15*1024*1024;
> +uint32_t rounded_nb_total_tsx_insertion;
> +
> +static rte_atomic64_t gcycles;
> +static rte_atomic64_t ginsertions;
> +
> +static int
> +test_hash_multiwriter_worker(__attribute__((unused)) void *arg)
> +{
> +	uint64_t i, offset;
> +	uint32_t lcore_id = rte_lcore_id();
> +	uint64_t begin, cycles;
> +
> +	offset = (lcore_id - rte_get_master_lcore())
> +		* tbl_multiwriter_test_params.nb_tsx_insertion;
> +
> +	printf("Core #%d inserting %d: %'"PRId64" - %'"PRId64"\n",
> +	       lcore_id, tbl_multiwriter_test_params.nb_tsx_insertion,
> +	       offset, offset + tbl_multiwriter_test_params.nb_tsx_insertion);
> +
> +	begin = rte_rdtsc_precise();
> +
> +	for (i = offset;
> +	     i < offset + tbl_multiwriter_test_params.nb_tsx_insertion;
> +	     i++) {
> +		if (rte_hash_add_key(tbl_multiwriter_test_params.h,
> +				     tbl_multiwriter_test_params.keys + i) < 0)
> +			break;
> +	}
> +
> +	cycles = rte_rdtsc_precise() - begin;
> +	rte_atomic64_add(&gcycles, cycles);
> +	rte_atomic64_add(&ginsertions, i - offset);
> +
> +	for (; i < offset + tbl_multiwriter_test_params.nb_tsx_insertion; i++)
> +		tbl_multiwriter_test_params.keys[i]
> +			= RTE_APP_TEST_HASH_MULTIWRITER_FAILED;
> +
> +	return 0;
> +}
> +
> +
> +static int
> +test_hash_multiwriter(void)
> +{
> +	unsigned int i, rounded_nb_total_tsx_insertion;
> +	static unsigned calledCount = 1;
> +
> +	uint32_t *keys;
> +	uint32_t *found;
> +
> +	struct rte_hash_parameters hash_params = {
> +		.entries = nb_entries,
> +		.key_len = sizeof(uint32_t),
> +		.hash_func = rte_hash_crc,
> +		.hash_func_init_val = 0,
> +		.socket_id = rte_socket_id(),
> +		.extra_flag = RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT
> +		| RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD,
> +	};
> +
> +	struct rte_hash *handle;
> +	char name[RTE_HASH_NAMESIZE];
> +
> +	const void *next_key;
> +	void *next_data;
> +	uint32_t iter = 0;
> +
> +	uint32_t duplicated_keys = 0;
> +	uint32_t lost_keys = 0;
> +
> +	snprintf(name, 32, "test%u", calledCount++);
> +	hash_params.name = name;
> +
> +	handle = rte_hash_create(&hash_params);
> +	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
> +
> +	tbl_multiwriter_test_params.h = handle;
> +	tbl_multiwriter_test_params.nb_tsx_insertion =
> +		nb_total_tsx_insertion / rte_lcore_count();
> +
> +	rounded_nb_total_tsx_insertion = (nb_total_tsx_insertion /
> +		tbl_multiwriter_test_params.nb_tsx_insertion)
> +		* tbl_multiwriter_test_params.nb_tsx_insertion;
> +
> +	rte_srand(rte_rdtsc());
> +
> +	keys = rte_malloc(NULL, sizeof(uint32_t) * nb_entries, 0);
> +
> +	if (keys == NULL) {
> +		printf("RTE_MALLOC failed\n");
> +		goto err1;
> +	}
> +
> +	found = rte_zmalloc(NULL, sizeof(uint32_t) * nb_entries, 0);
> +	if (found == NULL) {
> +		printf("RTE_ZMALLOC failed\n");
> +		goto err2;
> +	}
> +
> +	for (i = 0; i < nb_entries; i++)
> +		keys[i] = i;
> +
> +	tbl_multiwriter_test_params.keys = keys;
> +	tbl_multiwriter_test_params.found = found;
> +
> +	rte_atomic64_init(&gcycles);
> +	rte_atomic64_clear(&gcycles);
> +
> +	rte_atomic64_init(&ginsertions);
> +	rte_atomic64_clear(&ginsertions);
> +
> +	/* Fire all threads. */
> +	rte_eal_mp_remote_launch(test_hash_multiwriter_worker,
> +				 NULL, CALL_MASTER);
> +	rte_eal_mp_wait_lcore();
> +
> +	while (rte_hash_iterate(handle, &next_key, &next_data, &iter) >= 0) {
> +		/* Search for the key in the list of keys added .*/
> +		i = *(const uint32_t *)next_key;
> +		tbl_multiwriter_test_params.found[i]++;
> +	}
> +
> +	for (i = 0; i < rounded_nb_total_tsx_insertion; i++) {
> +		if (tbl_multiwriter_test_params.keys[i]
> +		    != RTE_APP_TEST_HASH_MULTIWRITER_FAILED) {
> +			if (tbl_multiwriter_test_params.found[i] > 1) {
> +				duplicated_keys++;
> +				break;
> +			}
> +			if (tbl_multiwriter_test_params.found[i] == 0) {
> +				lost_keys++;
> +				printf("key %d is lost\n", i);
> +				break;
> +			}
> +		}
> +	}
> +
> +	if (duplicated_keys > 0) {
> +		printf("%d key duplicated\n", duplicated_keys);
> +		goto err3;
> +	}
> +
> +	if (lost_keys > 0) {
> +		printf("%d key lost\n", lost_keys);
> +		goto err3;
> +	}
> +
> +	printf("No key corrupted during multiwriter insertion.\n");
> +
> +	unsigned long long int cycles_per_insertion =
> +		rte_atomic64_read(&gcycles)/
> +		rte_atomic64_read(&ginsertions);
> +
> +	printf(" cycles per insertion: %llu\n", cycles_per_insertion);
> +
> +	rte_free(tbl_multiwriter_test_params.found);
> +	rte_free(tbl_multiwriter_test_params.keys);
> +	rte_hash_free(handle);
> +	return 0;
> +
> +err3:
> +	rte_free(tbl_multiwriter_test_params.found);
> +err2:
> +	rte_free(tbl_multiwriter_test_params.keys);
> +err1:
> +	rte_hash_free(handle);
> +	return -1;
> +}
> +
> +static int
> +test_hash_multiwriter_main(void)
> +{
> +	int r = -1;
> +
> +	if (rte_lcore_count() == 1) {
> +		printf(
> +			"More than one lcore is required to do multiwriter test\n");
> +		return 0;
> +	}
> +
> +	if (!rte_tm_supported()) {
> +		printf(
> +			"Hardware transactional memory (lock elision) is NOT supported\n");
> +		return 0;
> +	}
> +
> +	printf("Hardware transactional memory (lock elision) is supported\n");
> +
> +	setlocale(LC_NUMERIC, "");
> +
> +	r = test_hash_multiwriter();
> +
> +	return r;
> +}
> +
> +
> +static struct test_command hash_scaling_cmd = {
> +	.command = "hash_multiwriter_autotest",
> +	.callback = test_hash_multiwriter_main,
> +};
> +
> +REGISTER_TEST_COMMAND(hash_scaling_cmd);
> diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
> index 131723c..f8264fb 100644
> --- a/doc/guides/rel_notes/release_16_07.rst
> +++ b/doc/guides/rel_notes/release_16_07.rst
> @@ -70,6 +70,18 @@ New Features
>    * Enable RSS per network interface through the configuration file.
>    * Streamline the CLI code.
> 
> +* **Added multi-writer support for RTE Hash with Intel TSX.**
> +
> +  The following features/modifications have been added to rte_hash library:
> +
> +  * Enabled application developers to use an extra flag for rte_hash creation
> +    to specify default behavior (multi-thread safe/unsafe) with rte_hash_add_key
> +    function.
> +  * Changed Cuckoo search algorithm to breadth first search for multi-writer
> +    routine and split Cuckoo Search and Move operations in order to reduce
> +    transactional code region and improve TSX performance.
> +  * Added a hash multi-writer test case for test app.
> +
> 
>  Resolved Issues
>  ---------------
> diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
> index 7b7d1f8..3cb6770 100644
> --- a/lib/librte_hash/rte_cuckoo_hash.c
> +++ b/lib/librte_hash/rte_cuckoo_hash.c
> @@ -1,7 +1,7 @@
>  /*-
>   *   BSD LICENSE
>   *
> - *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
>   *   All rights reserved.
>   *
>   *   Redistribution and use in source and binary forms, with or without
> @@ -100,7 +100,13 @@ EAL_REGISTER_TAILQ(rte_hash_tailq)
> 
>  #define KEY_ALIGNMENT			16
> 
> -#define LCORE_CACHE_SIZE		8
> +#define LCORE_CACHE_SIZE		64
> +
> +#define RTE_HASH_BFS_QUEUE_MAX_LEN  1000
> +
> +#define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4
> +
> +#define RTE_HASH_TSX_MAX_RETRY 10
> 
>  #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
>  /*
> @@ -190,6 +196,7 @@ struct rte_hash {
>  							memory support */
>  	struct lcore_cache *local_free_slots;
>  	/**< Local cache per lcore, storing some indexes of the free slots */
> +	uint8_t multiwriter_add; /**< Multi-write safe hash add behavior */
>  } __rte_cache_aligned;
> 
>  /* Structure storing both primary and secondary hashes */
> @@ -372,7 +379,7 @@ rte_hash_create(const struct rte_hash_parameters *params)
> 
>  /*
>   * If x86 architecture is used, select appropriate compare function,
> - * which may use x86 instrinsics, otherwise use memcmp
> + * which may use x86 intrinsics, otherwise use memcmp
>   */
>  #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
>  	/* Select function to compare keys */
> @@ -431,7 +438,16 @@ rte_hash_create(const struct rte_hash_parameters *params)
>  	h->free_slots = r;
>  	h->hw_trans_mem_support = hw_trans_mem_support;
> 
> -	/* populate the free slots ring. Entry zero is reserved for key misses */
> +	/* Turn on multi-writer only with explicit flat from user and TM
> +	 * support.
> +	 */
> +	if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD
> +	    && h->hw_trans_mem_support)
> +		h->multiwriter_add = 1;
> +	else
> +		h->multiwriter_add = 0;


Wonder why MULTIPLE-WRITER support has to be implemented only for machines with TSX support?
From initial discussion my understanding was that it would work on both arhitectures with and without TSX:
on non-TSX platforms approach with spin-lock will be used.
Do I miss something here?   

> +
> +	/* Populate free slots ring. Entry zero is reserved for key misses. */
>  	for (i = 1; i < params->entries + 1; i++)
>  		rte_ring_sp_enqueue(r, (void *)((uintptr_t) i));
> 
> @@ -599,6 +615,123 @@ make_space_bucket(const struct rte_hash *h, struct rte_hash_bucket *bkt)
> 
>  }
> 
> +struct queue_node {
> +	struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */
> +
> +	struct queue_node *prev;     /* Parent(bucket) in search path */
> +	int prev_slot;               /* Parent(slot) in search path */
> +};
> +
> +/* Shift buckets along cuckoo_path and fill the path head with new entry */
> +static inline int
> +tsx_cuckoo_move_insert(const struct rte_hash *h, struct queue_node *leaf,
> +			uint32_t leaf_slot, hash_sig_t sig,
> +			hash_sig_t alt_hash, uint32_t new_idx)
> +{
> +	unsigned try = 0;
> +	unsigned status;
> +	uint32_t prev_alt_bkt_idx;
> +
> +	struct queue_node *prev_node, *curr_node = leaf;
> +	struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt;
> +	uint32_t prev_slot, curr_slot = leaf_slot;
> +
> +	while (try < RTE_HASH_TSX_MAX_RETRY) {
> +		status = rte_xbegin();

Hmm, would it compile for non-IA platform?
As I remember, we have rte_xbegin/xend/... defined only for x86 arch,
and I don't see here any mechanism to exclude that code from compilation on non-x86 arch
(#ifdef RTE_ARCH_X86 or so).

Konstantin

> +		if (likely(status == RTE_XBEGIN_STARTED)) {
> +			while (likely(curr_node->prev != NULL)) {
> +				prev_node = curr_node->prev;
> +				prev_bkt = prev_node->bkt;
> +				prev_slot = curr_node->prev_slot;
> +
> +				prev_alt_bkt_idx
> +					= prev_bkt->signatures[prev_slot].alt
> +					    & h->bucket_bitmask;
> +
> +				if (unlikely(&h->buckets[prev_alt_bkt_idx]
> +					     != curr_bkt)) {
> +					rte_xabort(RTE_XABORT_CUCKOO_PATH_INVALIDED);
> +				}
> +
> +				/* Need to swap current/alt sig to allow later Cuckoo insert to
> +				 * move elements back to its primary bucket if available
> +				 */
> +				curr_bkt->signatures[curr_slot].alt =
> +				    prev_bkt->signatures[prev_slot].current;
> +				curr_bkt->signatures[curr_slot].current =
> +				    prev_bkt->signatures[prev_slot].alt;
> +				curr_bkt->key_idx[curr_slot]
> +				    = prev_bkt->key_idx[prev_slot];
> +
> +				curr_slot = prev_slot;
> +				curr_node = prev_node;
> +				curr_bkt = curr_node->bkt;
> +			}
> +
> +			curr_bkt->signatures[curr_slot].current = sig;
> +			curr_bkt->signatures[curr_slot].alt = alt_hash;
> +			curr_bkt->key_idx[curr_slot] = new_idx;
> +
> +			rte_xend();
> +
> +			return 0;
> +		}
> +
> +		/* If we abort we give up this cuckoo path, since most likely it's
> +		 * no longer valid as TSX detected data conflict
> +		 */
> +		try++;
> +		rte_pause();
> +	}
> +
> +	return -1;
> +}
> +
> +/*
> + * Make space for new key, using bfs Cuckoo Search and Multi-Writer safe
> + * Cuckoo
> + */
> +static inline int
> +make_space_insert_bfs_mw(const struct rte_hash *h, struct rte_hash_bucket *bkt,
> +			hash_sig_t sig, hash_sig_t alt_hash,
> +			uint32_t new_idx)
> +{
> +	unsigned i;
> +	struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN];
> +	struct queue_node *tail, *head;
> +	struct rte_hash_bucket *curr_bkt, *alt_bkt;
> +
> +	tail = queue;
> +	head = queue + 1;
> +	tail->bkt = bkt;
> +	tail->prev = NULL;
> +	tail->prev_slot = -1;
> +
> +	/* Cuckoo bfs Search */
> +	while (likely(tail != head && head <
> +		            queue + RTE_HASH_BFS_QUEUE_MAX_LEN - 4)) {
> +		curr_bkt = tail->bkt;
> +		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			if (curr_bkt->signatures[i].sig == NULL_SIGNATURE) {
> +				if (likely(tsx_cuckoo_move_insert(h, tail, i,
> +					         sig, alt_hash, new_idx) == 0))
> +					return 0;
> +			}
> +
> +			/* Enqueue new node and keep prev node info */
> +			alt_bkt = &(h->buckets[curr_bkt->signatures[i].alt
> +						    & h->bucket_bitmask]);
> +			head->bkt = alt_bkt;
> +			head->prev = tail;
> +			head->prev_slot = i;
> +			head++;
> +		}
> +		tail++;
> +	}
> +
> +	return -ENOSPC;
> +}
> +
>  /*
>   * Function called to enqueue back an index in the cache/ring,
>   * as slot has not being used and it can be used in the
> @@ -712,30 +845,78 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
>  	rte_memcpy(new_k->key, key, h->key_len);
>  	new_k->pdata = data;
> 
> -	/* Insert new entry is there is room in the primary bucket */
> -	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> -		/* Check if slot is available */
> -		if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) {
> -			prim_bkt->signatures[i].current = sig;
> -			prim_bkt->signatures[i].alt = alt_hash;
> -			prim_bkt->key_idx[i] = new_idx;
> +	if (h->multiwriter_add) {
> +		unsigned status;
> +		unsigned try = 0;
> +
> +		while (try < RTE_HASH_TSX_MAX_RETRY) {
> +			status = rte_xbegin();
> +			if (likely(status == RTE_XBEGIN_STARTED)) {
> +				/* Insert new entry if there is room in the primary
> +				* bucket.
> +				*/
> +				for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +					/* Check if slot is available */
> +					if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) {
> +						prim_bkt->signatures[i].current = sig;
> +						prim_bkt->signatures[i].alt = alt_hash;
> +						prim_bkt->key_idx[i] = new_idx;
> +						break;
> +					}
> +				}
> +				rte_xend();
> +
> +				if (i != RTE_HASH_BUCKET_ENTRIES)
> +					return new_idx - 1;
> +
> +				break; /* break off try loop if transaction commits */
> +			} else {
> +				/* If we abort we give up this cuckoo path. */
> +				try++;
> +				rte_pause();
> +			}
> +		}
> +
> +		/* Primary bucket full, need to make space for new entry */
> +		ret = make_space_insert_bfs_mw(h, prim_bkt, sig, alt_hash,
> +		                               new_idx);
> +
> +		if (ret >= 0)
>  			return new_idx - 1;
> +
> +		/* Also search secondary bucket to get better occupancy */
> +		ret = make_space_insert_bfs_mw(h, sec_bkt, sig, alt_hash,
> +		                               new_idx);
> +
> +		if (ret >= 0)
> +			return new_idx - 1;
> +	} else {
> +		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			/* Check if slot is available */
> +			if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) {
> +				prim_bkt->signatures[i].current = sig;
> +				prim_bkt->signatures[i].alt = alt_hash;
> +				prim_bkt->key_idx[i] = new_idx;
> +				break;
> +			}
>  		}
> -	}
> 
> -	/* Primary bucket is full, so we need to make space for new entry */
> -	ret = make_space_bucket(h, prim_bkt);
> -	/*
> -	 * After recursive function.
> -	 * Insert the new entry in the position of the pushed entry
> -	 * if successful or return error and
> -	 * store the new slot back in the ring
> -	 */
> -	if (ret >= 0) {
> -		prim_bkt->signatures[ret].current = sig;
> -		prim_bkt->signatures[ret].alt = alt_hash;
> -		prim_bkt->key_idx[ret] = new_idx;
> -		return new_idx - 1;
> +		if (i != RTE_HASH_BUCKET_ENTRIES)
> +			return new_idx - 1;
> +
> +		/* Primary bucket full, need to make space for new entry
> +		 * After recursive function.
> +		 * Insert the new entry in the position of the pushed entry
> +		 * if successful or return error and
> +		 * store the new slot back in the ring
> +		 */
> +		ret = make_space_bucket(h, prim_bkt);
> +		if (ret >= 0) {
> +			prim_bkt->signatures[ret].current = sig;
> +			prim_bkt->signatures[ret].alt = alt_hash;
> +			prim_bkt->key_idx[ret] = new_idx;
> +			return new_idx - 1;
> +		}
>  	}
> 
>  	/* Error in addition, store new slot back in the ring and return error */
> diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h
> index 724315a..c9612fb 100644
> --- a/lib/librte_hash/rte_hash.h
> +++ b/lib/librte_hash/rte_hash.h
> @@ -60,6 +60,9 @@ extern "C" {
>  /** Enable Hardware transactional memory support. */
>  #define RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT	0x01
> 
> +/** Default behavior of insertion, single writer/multi writer */
> +#define RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD 0x02
> +
>  /** Signature of key that is stored internally. */
>  typedef uint32_t hash_sig_t;
> 
> --
> 2.5.5
  
Shen Wei June 16, 2016, 10:14 p.m. UTC | #2
Here's the latest version of the rte_hash multi-writer patch.
It's re-based on top of the latest head as of Jun 16, 2016.

http://dpdk.org/dev/patchwork/patch/13886/
http://dpdk.org/dev/patchwork/patch/12589/

v3 changes:

* Made spinlock as fall back behavior when developer choose to use multi-writer
  behavior while HTM is not available.
* Created a rte_cuckoo_hash_x86.h file to hold all x86-specific related
  cuckoo_hash functions. And rte_cuckoo_hash.c uses compile time flag to
  select x86 file or other platform-specific implementations. While HTM check
  is still done at runtime (same with RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT)
* Moved rte_hash private struct definitions to rte_cuckoo_hash.h, to allow
  rte_cuckoo_hash_x86.h or future platform dependent functions to include.
* Following renaming for consistent names when new platform TM support are
  added.
  - rte_hash_cuckoo_insert_mw_tm for trying insertion without moving buckets
   around.
  - rte_hash_cuckoo_move_insert_mw_tm for trying insertion by moving buckets
   around.

v2 changes:

* Address issues pointed out by reviews on mailing list.
* Removed the RTE_HASH_KEY_FLAG_MOVED flag used in v1, which would cause
  problem when key deletion happens.

Wei Shen (1):
  rte_hash: add scalable multi-writer insertion w/ Intel TSX

 app/test/Makefile                      |   1 +
 app/test/test_hash_multiwriter.c       | 287 +++++++++++++++++++++++++++++++++
 doc/guides/rel_notes/release_16_07.rst |  12 ++
 lib/librte_hash/rte_cuckoo_hash.c      | 258 ++++++++++-------------------
 lib/librte_hash/rte_cuckoo_hash.h      | 219 +++++++++++++++++++++++++
 lib/librte_hash/rte_cuckoo_hash_x86.h  | 193 ++++++++++++++++++++++
 lib/librte_hash/rte_hash.h             |   3 +
 7 files changed, 796 insertions(+), 177 deletions(-)
 create mode 100644 app/test/test_hash_multiwriter.c
 create mode 100644 lib/librte_hash/rte_cuckoo_hash.h
 create mode 100644 lib/librte_hash/rte_cuckoo_hash_x86.h
  

Patch

diff --git a/app/test/Makefile b/app/test/Makefile
index 053f3a2..5476300 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -120,6 +120,7 @@  SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_thash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_perf.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_functions.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_scaling.c
+SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_multiwriter.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_LPM) += test_lpm.c
 SRCS-$(CONFIG_RTE_LIBRTE_LPM) += test_lpm_perf.c
diff --git a/app/test/test_hash_multiwriter.c b/app/test/test_hash_multiwriter.c
new file mode 100644
index 0000000..54a0d2c
--- /dev/null
+++ b/app/test/test_hash_multiwriter.c
@@ -0,0 +1,272 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in
+ *	 the documentation and/or other materials provided with the
+ *	 distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *	 contributors may be used to endorse or promote products derived
+ *	 from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <inttypes.h>
+#include <locale.h>
+
+#include <rte_cycles.h>
+#include <rte_hash.h>
+#include <rte_hash_crc.h>
+#include <rte_launch.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_spinlock.h>
+
+#include "test.h"
+
+/*
+ * Check condition and return an error if true. Assumes that "handle" is the
+ * name of the hash structure pointer to be freed.
+ */
+#define RETURN_IF_ERROR(cond, str, ...) do {                            \
+	if (cond) {                                                     \
+		printf("ERROR line %d: " str "\n", __LINE__,            \
+							##__VA_ARGS__);	\
+		if (handle)                                             \
+			rte_hash_free(handle);                          \
+		return -1;                                              \
+	}                                                               \
+} while (0)
+
+#define RTE_APP_TEST_HASH_MULTIWRITER_FAILED 0
+
+struct {
+	uint32_t *keys;
+	uint32_t *found;
+	uint32_t nb_tsx_insertion;
+	struct rte_hash *h;
+} tbl_multiwriter_test_params;
+
+const uint32_t nb_entries = 16*1024*1024;
+const uint32_t nb_total_tsx_insertion = 15*1024*1024;
+uint32_t rounded_nb_total_tsx_insertion;
+
+static rte_atomic64_t gcycles;
+static rte_atomic64_t ginsertions;
+
+static int
+test_hash_multiwriter_worker(__attribute__((unused)) void *arg)
+{
+	uint64_t i, offset;
+	uint32_t lcore_id = rte_lcore_id();
+	uint64_t begin, cycles;
+
+	offset = (lcore_id - rte_get_master_lcore())
+		* tbl_multiwriter_test_params.nb_tsx_insertion;
+
+	printf("Core #%d inserting %d: %'"PRId64" - %'"PRId64"\n",
+	       lcore_id, tbl_multiwriter_test_params.nb_tsx_insertion,
+	       offset, offset + tbl_multiwriter_test_params.nb_tsx_insertion);
+
+	begin = rte_rdtsc_precise();
+
+	for (i = offset;
+	     i < offset + tbl_multiwriter_test_params.nb_tsx_insertion;
+	     i++) {
+		if (rte_hash_add_key(tbl_multiwriter_test_params.h,
+				     tbl_multiwriter_test_params.keys + i) < 0)
+			break;
+	}
+
+	cycles = rte_rdtsc_precise() - begin;
+	rte_atomic64_add(&gcycles, cycles);
+	rte_atomic64_add(&ginsertions, i - offset);
+
+	for (; i < offset + tbl_multiwriter_test_params.nb_tsx_insertion; i++)
+		tbl_multiwriter_test_params.keys[i]
+			= RTE_APP_TEST_HASH_MULTIWRITER_FAILED;
+
+	return 0;
+}
+
+
+static int
+test_hash_multiwriter(void)
+{
+	unsigned int i, rounded_nb_total_tsx_insertion;
+	static unsigned calledCount = 1;
+
+	uint32_t *keys;
+	uint32_t *found;
+
+	struct rte_hash_parameters hash_params = {
+		.entries = nb_entries,
+		.key_len = sizeof(uint32_t),
+		.hash_func = rte_hash_crc,
+		.hash_func_init_val = 0,
+		.socket_id = rte_socket_id(),
+		.extra_flag = RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT
+		| RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD,
+	};
+
+	struct rte_hash *handle;
+	char name[RTE_HASH_NAMESIZE];
+
+	const void *next_key;
+	void *next_data;
+	uint32_t iter = 0;
+
+	uint32_t duplicated_keys = 0;
+	uint32_t lost_keys = 0;
+
+	snprintf(name, 32, "test%u", calledCount++);
+	hash_params.name = name;
+
+	handle = rte_hash_create(&hash_params);
+	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
+
+	tbl_multiwriter_test_params.h = handle;
+	tbl_multiwriter_test_params.nb_tsx_insertion =
+		nb_total_tsx_insertion / rte_lcore_count();
+
+	rounded_nb_total_tsx_insertion = (nb_total_tsx_insertion /
+		tbl_multiwriter_test_params.nb_tsx_insertion)
+		* tbl_multiwriter_test_params.nb_tsx_insertion;
+
+	rte_srand(rte_rdtsc());
+
+	keys = rte_malloc(NULL, sizeof(uint32_t) * nb_entries, 0);
+
+	if (keys == NULL) {
+		printf("RTE_MALLOC failed\n");
+		goto err1;
+	}
+
+	found = rte_zmalloc(NULL, sizeof(uint32_t) * nb_entries, 0);
+	if (found == NULL) {
+		printf("RTE_ZMALLOC failed\n");
+		goto err2;
+	}
+
+	for (i = 0; i < nb_entries; i++)
+		keys[i] = i;
+
+	tbl_multiwriter_test_params.keys = keys;
+	tbl_multiwriter_test_params.found = found;
+
+	rte_atomic64_init(&gcycles);
+	rte_atomic64_clear(&gcycles);
+
+	rte_atomic64_init(&ginsertions);
+	rte_atomic64_clear(&ginsertions);
+
+	/* Fire all threads. */
+	rte_eal_mp_remote_launch(test_hash_multiwriter_worker,
+				 NULL, CALL_MASTER);
+	rte_eal_mp_wait_lcore();
+
+	while (rte_hash_iterate(handle, &next_key, &next_data, &iter) >= 0) {
+		/* Search for the key in the list of keys added .*/
+		i = *(const uint32_t *)next_key;
+		tbl_multiwriter_test_params.found[i]++;
+	}
+
+	for (i = 0; i < rounded_nb_total_tsx_insertion; i++) {
+		if (tbl_multiwriter_test_params.keys[i]
+		    != RTE_APP_TEST_HASH_MULTIWRITER_FAILED) {
+			if (tbl_multiwriter_test_params.found[i] > 1) {
+				duplicated_keys++;
+				break;
+			}
+			if (tbl_multiwriter_test_params.found[i] == 0) {
+				lost_keys++;
+				printf("key %d is lost\n", i);
+				break;
+			}
+		}
+	}
+
+	if (duplicated_keys > 0) {
+		printf("%d key duplicated\n", duplicated_keys);
+		goto err3;
+	}
+
+	if (lost_keys > 0) {
+		printf("%d key lost\n", lost_keys);
+		goto err3;
+	}
+
+	printf("No key corrupted during multiwriter insertion.\n");
+
+	unsigned long long int cycles_per_insertion =
+		rte_atomic64_read(&gcycles)/
+		rte_atomic64_read(&ginsertions);
+
+	printf(" cycles per insertion: %llu\n", cycles_per_insertion);
+
+	rte_free(tbl_multiwriter_test_params.found);
+	rte_free(tbl_multiwriter_test_params.keys);
+	rte_hash_free(handle);
+	return 0;
+
+err3:
+	rte_free(tbl_multiwriter_test_params.found);
+err2:
+	rte_free(tbl_multiwriter_test_params.keys);
+err1:
+	rte_hash_free(handle);
+	return -1;
+}
+
+static int
+test_hash_multiwriter_main(void)
+{
+	int r = -1;
+
+	if (rte_lcore_count() == 1) {
+		printf(
+			"More than one lcore is required to do multiwriter test\n");
+		return 0;
+	}
+
+	if (!rte_tm_supported()) {
+		printf(
+			"Hardware transactional memory (lock elision) is NOT supported\n");
+		return 0;
+	}
+
+	printf("Hardware transactional memory (lock elision) is supported\n");
+
+	setlocale(LC_NUMERIC, "");
+
+	r = test_hash_multiwriter();
+
+	return r;
+}
+
+
+static struct test_command hash_scaling_cmd = {
+	.command = "hash_multiwriter_autotest",
+	.callback = test_hash_multiwriter_main,
+};
+
+REGISTER_TEST_COMMAND(hash_scaling_cmd);
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 131723c..f8264fb 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -70,6 +70,18 @@  New Features
   * Enable RSS per network interface through the configuration file.
   * Streamline the CLI code.
 
+* **Added multi-writer support for RTE Hash with Intel TSX.**
+
+  The following features/modifications have been added to rte_hash library:
+
+  * Enabled application developers to use an extra flag for rte_hash creation
+    to specify default behavior (multi-thread safe/unsafe) with rte_hash_add_key
+    function.
+  * Changed Cuckoo search algorithm to breadth first search for multi-writer
+    routine and split Cuckoo Search and Move operations in order to reduce
+    transactional code region and improve TSX performance.
+  * Added a hash multi-writer test case for test app.
+
 
 Resolved Issues
 ---------------
diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
index 7b7d1f8..3cb6770 100644
--- a/lib/librte_hash/rte_cuckoo_hash.c
+++ b/lib/librte_hash/rte_cuckoo_hash.c
@@ -1,7 +1,7 @@ 
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -100,7 +100,13 @@  EAL_REGISTER_TAILQ(rte_hash_tailq)
 
 #define KEY_ALIGNMENT			16
 
-#define LCORE_CACHE_SIZE		8
+#define LCORE_CACHE_SIZE		64
+
+#define RTE_HASH_BFS_QUEUE_MAX_LEN  1000
+
+#define RTE_XABORT_CUCKOO_PATH_INVALIDED 0x4
+
+#define RTE_HASH_TSX_MAX_RETRY 10
 
 #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 /*
@@ -190,6 +196,7 @@  struct rte_hash {
 							memory support */
 	struct lcore_cache *local_free_slots;
 	/**< Local cache per lcore, storing some indexes of the free slots */
+	uint8_t multiwriter_add; /**< Multi-write safe hash add behavior */
 } __rte_cache_aligned;
 
 /* Structure storing both primary and secondary hashes */
@@ -372,7 +379,7 @@  rte_hash_create(const struct rte_hash_parameters *params)
 
 /*
  * If x86 architecture is used, select appropriate compare function,
- * which may use x86 instrinsics, otherwise use memcmp
+ * which may use x86 intrinsics, otherwise use memcmp
  */
 #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	/* Select function to compare keys */
@@ -431,7 +438,16 @@  rte_hash_create(const struct rte_hash_parameters *params)
 	h->free_slots = r;
 	h->hw_trans_mem_support = hw_trans_mem_support;
 
-	/* populate the free slots ring. Entry zero is reserved for key misses */
+	/* Turn on multi-writer only with explicit flat from user and TM
+	 * support.
+	 */
+	if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD
+	    && h->hw_trans_mem_support)
+		h->multiwriter_add = 1;
+	else
+		h->multiwriter_add = 0;
+
+	/* Populate free slots ring. Entry zero is reserved for key misses. */
 	for (i = 1; i < params->entries + 1; i++)
 		rte_ring_sp_enqueue(r, (void *)((uintptr_t) i));
 
@@ -599,6 +615,123 @@  make_space_bucket(const struct rte_hash *h, struct rte_hash_bucket *bkt)
 
 }
 
+struct queue_node {
+	struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */
+
+	struct queue_node *prev;     /* Parent(bucket) in search path */
+	int prev_slot;               /* Parent(slot) in search path */
+};
+
+/* Shift buckets along cuckoo_path and fill the path head with new entry */
+static inline int
+tsx_cuckoo_move_insert(const struct rte_hash *h, struct queue_node *leaf,
+			uint32_t leaf_slot, hash_sig_t sig,
+			hash_sig_t alt_hash, uint32_t new_idx)
+{
+	unsigned try = 0;
+	unsigned status;
+	uint32_t prev_alt_bkt_idx;
+
+	struct queue_node *prev_node, *curr_node = leaf;
+	struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt;
+	uint32_t prev_slot, curr_slot = leaf_slot;
+
+	while (try < RTE_HASH_TSX_MAX_RETRY) {
+		status = rte_xbegin();
+		if (likely(status == RTE_XBEGIN_STARTED)) {
+			while (likely(curr_node->prev != NULL)) {
+				prev_node = curr_node->prev;
+				prev_bkt = prev_node->bkt;
+				prev_slot = curr_node->prev_slot;
+
+				prev_alt_bkt_idx
+					= prev_bkt->signatures[prev_slot].alt
+					    & h->bucket_bitmask;
+
+				if (unlikely(&h->buckets[prev_alt_bkt_idx]
+					     != curr_bkt)) {
+					rte_xabort(RTE_XABORT_CUCKOO_PATH_INVALIDED);
+				}
+
+				/* Need to swap current/alt sig to allow later Cuckoo insert to
+				 * move elements back to its primary bucket if available
+				 */
+				curr_bkt->signatures[curr_slot].alt =
+				    prev_bkt->signatures[prev_slot].current;
+				curr_bkt->signatures[curr_slot].current =
+				    prev_bkt->signatures[prev_slot].alt;
+				curr_bkt->key_idx[curr_slot]
+				    = prev_bkt->key_idx[prev_slot];
+
+				curr_slot = prev_slot;
+				curr_node = prev_node;
+				curr_bkt = curr_node->bkt;
+			}
+
+			curr_bkt->signatures[curr_slot].current = sig;
+			curr_bkt->signatures[curr_slot].alt = alt_hash;
+			curr_bkt->key_idx[curr_slot] = new_idx;
+
+			rte_xend();
+
+			return 0;
+		}
+
+		/* If we abort we give up this cuckoo path, since most likely it's
+		 * no longer valid as TSX detected data conflict
+		 */
+		try++;
+		rte_pause();
+	}
+
+	return -1;
+}
+
+/*
+ * Make space for new key, using bfs Cuckoo Search and Multi-Writer safe
+ * Cuckoo
+ */
+static inline int
+make_space_insert_bfs_mw(const struct rte_hash *h, struct rte_hash_bucket *bkt,
+			hash_sig_t sig, hash_sig_t alt_hash,
+			uint32_t new_idx)
+{
+	unsigned i;
+	struct queue_node queue[RTE_HASH_BFS_QUEUE_MAX_LEN];
+	struct queue_node *tail, *head;
+	struct rte_hash_bucket *curr_bkt, *alt_bkt;
+
+	tail = queue;
+	head = queue + 1;
+	tail->bkt = bkt;
+	tail->prev = NULL;
+	tail->prev_slot = -1;
+
+	/* Cuckoo bfs Search */
+	while (likely(tail != head && head <
+		            queue + RTE_HASH_BFS_QUEUE_MAX_LEN - 4)) {
+		curr_bkt = tail->bkt;
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			if (curr_bkt->signatures[i].sig == NULL_SIGNATURE) {
+				if (likely(tsx_cuckoo_move_insert(h, tail, i,
+					         sig, alt_hash, new_idx) == 0))
+					return 0;
+			}
+
+			/* Enqueue new node and keep prev node info */
+			alt_bkt = &(h->buckets[curr_bkt->signatures[i].alt
+						    & h->bucket_bitmask]);
+			head->bkt = alt_bkt;
+			head->prev = tail;
+			head->prev_slot = i;
+			head++;
+		}
+		tail++;
+	}
+
+	return -ENOSPC;
+}
+
 /*
  * Function called to enqueue back an index in the cache/ring,
  * as slot has not being used and it can be used in the
@@ -712,30 +845,78 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
 	rte_memcpy(new_k->key, key, h->key_len);
 	new_k->pdata = data;
 
-	/* Insert new entry is there is room in the primary bucket */
-	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-		/* Check if slot is available */
-		if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) {
-			prim_bkt->signatures[i].current = sig;
-			prim_bkt->signatures[i].alt = alt_hash;
-			prim_bkt->key_idx[i] = new_idx;
+	if (h->multiwriter_add) {
+		unsigned status;
+		unsigned try = 0;
+
+		while (try < RTE_HASH_TSX_MAX_RETRY) {
+			status = rte_xbegin();
+			if (likely(status == RTE_XBEGIN_STARTED)) {
+				/* Insert new entry if there is room in the primary
+				* bucket.
+				*/
+				for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+					/* Check if slot is available */
+					if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) {
+						prim_bkt->signatures[i].current = sig;
+						prim_bkt->signatures[i].alt = alt_hash;
+						prim_bkt->key_idx[i] = new_idx;
+						break;
+					}
+				}
+				rte_xend();
+
+				if (i != RTE_HASH_BUCKET_ENTRIES)
+					return new_idx - 1;
+
+				break; /* break off try loop if transaction commits */
+			} else {
+				/* If we abort we give up this cuckoo path. */
+				try++;
+				rte_pause();
+			}
+		}
+
+		/* Primary bucket full, need to make space for new entry */
+		ret = make_space_insert_bfs_mw(h, prim_bkt, sig, alt_hash,
+		                               new_idx);
+
+		if (ret >= 0)
 			return new_idx - 1;
+
+		/* Also search secondary bucket to get better occupancy */
+		ret = make_space_insert_bfs_mw(h, sec_bkt, sig, alt_hash,
+		                               new_idx);
+
+		if (ret >= 0)
+			return new_idx - 1;
+	} else {
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			/* Check if slot is available */
+			if (likely(prim_bkt->signatures[i].sig == NULL_SIGNATURE)) {
+				prim_bkt->signatures[i].current = sig;
+				prim_bkt->signatures[i].alt = alt_hash;
+				prim_bkt->key_idx[i] = new_idx;
+				break;
+			}
 		}
-	}
 
-	/* Primary bucket is full, so we need to make space for new entry */
-	ret = make_space_bucket(h, prim_bkt);
-	/*
-	 * After recursive function.
-	 * Insert the new entry in the position of the pushed entry
-	 * if successful or return error and
-	 * store the new slot back in the ring
-	 */
-	if (ret >= 0) {
-		prim_bkt->signatures[ret].current = sig;
-		prim_bkt->signatures[ret].alt = alt_hash;
-		prim_bkt->key_idx[ret] = new_idx;
-		return new_idx - 1;
+		if (i != RTE_HASH_BUCKET_ENTRIES)
+			return new_idx - 1;
+
+		/* Primary bucket full, need to make space for new entry
+		 * After recursive function.
+		 * Insert the new entry in the position of the pushed entry
+		 * if successful or return error and
+		 * store the new slot back in the ring
+		 */
+		ret = make_space_bucket(h, prim_bkt);
+		if (ret >= 0) {
+			prim_bkt->signatures[ret].current = sig;
+			prim_bkt->signatures[ret].alt = alt_hash;
+			prim_bkt->key_idx[ret] = new_idx;
+			return new_idx - 1;
+		}
 	}
 
 	/* Error in addition, store new slot back in the ring and return error */
diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h
index 724315a..c9612fb 100644
--- a/lib/librte_hash/rte_hash.h
+++ b/lib/librte_hash/rte_hash.h
@@ -60,6 +60,9 @@  extern "C" {
 /** Enable Hardware transactional memory support. */
 #define RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT	0x01
 
+/** Default behavior of insertion, single writer/multi writer */
+#define RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD 0x02
+
 /** Signature of key that is stored internally. */
 typedef uint32_t hash_sig_t;