[RFC,1/2] hash: add lock free support for extendable bucket

Message ID 20190302002334.7911-2-dharmik.thakkar@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series hash: add lock free support for ext bkt

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Dharmik Thakkar March 2, 2019, 12:23 a.m. UTC
  This patch enables lock-free read-write concurrency support for
extendable bucket feature.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 doc/guides/prog_guide/hash_lib.rst |   3 +-
 lib/librte_hash/rte_cuckoo_hash.c  | 150 +++++++++++++++++++----------
 lib/librte_hash/rte_cuckoo_hash.h  |   8 ++
 3 files changed, 110 insertions(+), 51 deletions(-)
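
For context, a minimal usage sketch (not part of the patch; the parameter values are
illustrative) of how an application would request both features together once this
series is applied:

    #include <rte_hash.h>
    #include <rte_jhash.h>

    static struct rte_hash *
    create_lf_ext_hash(void)
    {
        /* The flag combination below is what this patch starts to allow;
         * rte_hash_create() rejected it before this change.
         */
        struct rte_hash_parameters params = {
            .name = "lf_ext_hash",
            .entries = 1024,
            .key_len = sizeof(uint32_t),
            .hash_func = rte_jhash,
            .hash_func_init_val = 0,
            .socket_id = 0,
            .extra_flag = RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF |
                          RTE_HASH_EXTRA_FLAGS_EXT_TABLE,
        };

        return rte_hash_create(&params);
    }

With the lock-free flag, deleted key slots (and, with this patch, ext buckets emptied
by a delete) are only recycled once the application calls
rte_hash_free_key_with_position(), as discussed in the comments below.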
  

Comments

Wang, Yipeng1 March 7, 2019, 5:49 p.m. UTC | #1
Thanks for this patch! Some initial comments inlined:

>-----Original Message-----
>From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
>Sent: Friday, March 1, 2019 4:24 PM
>To: Wang, Yipeng1 <yipeng1.wang@intel.com>; Gobriel, Sameh <sameh.gobriel@intel.com>; Richardson, Bruce
><bruce.richardson@intel.com>; De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>; Mcnamara, John
><john.mcnamara@intel.com>; Kovacevic, Marko <marko.kovacevic@intel.com>
>Cc: dev@dpdk.org; Dharmik Thakkar <dharmik.thakkar@arm.com>
>Subject: [RFC 1/2] hash: add lock free support for extendable bucket
>
>This patch enables lock-free read-write concurrency support for
>extendable bucket feature.
>
>Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
>---
> doc/guides/prog_guide/hash_lib.rst |   3 +-
> lib/librte_hash/rte_cuckoo_hash.c  | 150 +++++++++++++++++++----------
> lib/librte_hash/rte_cuckoo_hash.h  |   8 ++
> 3 files changed, 110 insertions(+), 51 deletions(-)
>
>diff --git a/doc/guides/prog_guide/hash_lib.rst b/doc/guides/prog_guide/hash_lib.rst
>index 85a6edfa8b16..b00446e949ba 100644
>--- a/doc/guides/prog_guide/hash_lib.rst
>+++ b/doc/guides/prog_guide/hash_lib.rst
>@@ -108,8 +108,7 @@ Extendable Bucket Functionality support
> An extra flag is used to enable this functionality (flag is not set by default). When the (RTE_HASH_EXTRA_FLAGS_EXT_TABLE) is set
>and
> in the very unlikely case due to excessive hash collisions that a key has failed to be inserted, the hash table bucket is extended with a
>linked
> list to insert these failed keys. This feature is important for the workloads (e.g. telco workloads) that need to insert up to 100% of the
>-hash table size and can't tolerate any key insertion failure (even if very few). Currently the extendable bucket is not supported
>-with the lock-free concurrency implementation (RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF).
>+hash table size and can't tolerate any key insertion failure (even if very few).
>
>
> Implementation Details (non Extendable Bucket Case)
>diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
>index c01489ba5193..54762533ce36 100644
>--- a/lib/librte_hash/rte_cuckoo_hash.c
>+++ b/lib/librte_hash/rte_cuckoo_hash.c
>@@ -170,15 +170,6 @@ rte_hash_create(const struct rte_hash_parameters *params)
> 		return NULL;
> 	}
>
>-	if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) &&
>-	    (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)) {
>-		rte_errno = EINVAL;
>-		RTE_LOG(ERR, HASH, "rte_hash_create: extendable bucket "
>-			"feature not supported with rw concurrency "
>-			"lock free\n");
>-		return NULL;
>-	}
>-
> 	/* Check extra flags field to check extra options. */
> 	if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT)
> 		hw_trans_mem_support = 1;
>@@ -1054,7 +1045,15 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
> 			/* Check if slot is available */
> 			if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) {
> 				cur_bkt->sig_current[i] = short_sig;
>-				cur_bkt->key_idx[i] = new_idx;
>+				/* Key can be of arbitrary length, so it is
>+				 * not possible to store it atomically.
>+				 * Hence the new key element's memory stores
>+				 * (key as well as data) should be complete
>+				 * before it is referenced.
>+				 */
>+				__atomic_store_n(&cur_bkt->key_idx[i],
>+						 new_idx,
>+						 __ATOMIC_RELEASE);
> 				__hash_rw_writer_unlock(h);
> 				return new_idx - 1;
> 			}
>@@ -1072,10 +1071,23 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
> 	bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
> 	/* Use the first location of the new bucket */
> 	(h->buckets_ext[bkt_id]).sig_current[0] = short_sig;
>-	(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
>+	/* Key can be of arbitrary length, so it is
>+	 * not possible to store it atomically.
>+	 * Hence the new key element's memory stores
>+	 * (key as well as data) should be complete
>+	 * before it is referenced.
>+	 */
>+	__atomic_store_n(&(h->buckets_ext[bkt_id]).key_idx[0],
>+			 new_idx,
>+			 __ATOMIC_RELEASE);
 [Wang, Yipeng] Since it has not been linked and later on the linking is protected by
release, do we really need atomic store here?

> 	/* Link the new bucket to sec bucket linked list */
> 	last = rte_hash_get_last_bkt(sec_bkt);
>-	last->next = &h->buckets_ext[bkt_id];
>+	/* New bucket's memory stores (key as well as data)
>+	 * should be complete before it is referenced
>+	 */
>+	__atomic_store_n(&last->next,
>+			 &h->buckets_ext[bkt_id],
>+			 __ATOMIC_RELEASE);
> 	__hash_rw_writer_unlock(h);
> 	return new_idx - 1;
>
>@@ -1366,7 +1378,8 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i)
>  * empty slot.
>  */
> static inline void
>-__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
>+__rte_hash_compact_ll(const struct rte_hash *h,
>+			struct rte_hash_bucket *cur_bkt, int pos) {
> 	int i;
> 	struct rte_hash_bucket *last_bkt;
>
>@@ -1377,10 +1390,27 @@ __rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
>
> 	for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) {
> 		if (last_bkt->key_idx[i] != EMPTY_SLOT) {
>-			cur_bkt->key_idx[pos] = last_bkt->key_idx[i];
> 			cur_bkt->sig_current[pos] = last_bkt->sig_current[i];
>+			__atomic_store_n(&cur_bkt->key_idx[pos],
>+					 last_bkt->key_idx[i],
>+					 __ATOMIC_RELEASE);
>+			if (h->readwrite_concur_lf_support) {
>+				/* Inform the readers that the table has changed
>+				 * Since there is one writer, load acquires on
>+				 * tbl_chng_cnt are not required.
>+				 */
>+				__atomic_store_n(h->tbl_chng_cnt,
>+					 *h->tbl_chng_cnt + 1,
>+					 __ATOMIC_RELEASE);
>+				/* The stores to sig_alt and sig_current should
>+				 * not move above the store to tbl_chng_cnt.
>+				 */
>+				__atomic_thread_fence(__ATOMIC_RELEASE);
>+			}
> 			last_bkt->sig_current[i] = NULL_SIGNATURE;
>-			last_bkt->key_idx[i] = EMPTY_SLOT;
>+			__atomic_store_n(&last_bkt->key_idx[i],
>+					 EMPTY_SLOT,
>+					 __ATOMIC_RELEASE);
> 			return;
> 		}
> 	}
>@@ -1449,7 +1479,7 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
> 	/* look for key in primary bucket */
> 	ret = search_and_remove(h, key, prim_bkt, short_sig, &pos);
> 	if (ret != -1) {
>-		__rte_hash_compact_ll(prim_bkt, pos);
>+		__rte_hash_compact_ll(h, prim_bkt, pos);
> 		last_bkt = prim_bkt->next;
> 		prev_bkt = prim_bkt;
> 		goto return_bkt;
>@@ -1461,7 +1491,7 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
> 	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
> 		ret = search_and_remove(h, key, cur_bkt, short_sig, &pos);
> 		if (ret != -1) {
>-			__rte_hash_compact_ll(cur_bkt, pos);
>+			__rte_hash_compact_ll(h, cur_bkt, pos);
> 			last_bkt = sec_bkt->next;
> 			prev_bkt = sec_bkt;
> 			goto return_bkt;
>@@ -1488,11 +1518,21 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
> 	}
> 	/* found empty bucket and recycle */
> 	if (i == RTE_HASH_BUCKET_ENTRIES) {
>-		prev_bkt->next = last_bkt->next = NULL;
>+		__atomic_store_n(&prev_bkt->next,
>+				 NULL,
>+				 __ATOMIC_RELEASE);
> 		uint32_t index = last_bkt - h->buckets_ext + 1;
>-		rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
>-	}
>+		if (!h->no_free_on_del)
>+			rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
>+		else {
>+			struct rte_hash_key *key_slot =
>+				(struct rte_hash_key *)(
>+				(char *)h->key_store +
>+				ret * h->key_entry_size);
>+			key_slot->ext_bkt_to_free = index;
[Wang, Yipeng] Is there a chance that a key_slot may already have a previous ext_bkt
and now get overwritten, so that the previous one is gone forever?
>
>+		}
>+	}
> 	__hash_rw_writer_unlock(h);
> 	return ret;
> }
>@@ -1567,6 +1607,14 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
> 				(void *)((uintptr_t)position));
> 	}
>
>+	const struct rte_hash_key *key_slot = (const struct rte_hash_key *)(
>+						(const char *)h->key_store +
>+						position * h->key_entry_size);
>+	uint32_t index = key_slot->ext_bkt_to_free;
>+	if (!index)
>+		/* Recycle empty ext bkt to free list. */
>+		rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
>+
> 	return 0;
> }
>
>@@ -1855,6 +1903,9 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
> 		rte_prefetch0(secondary_bkt[i]);
> 	}
>
>+	for (i = 0; i < num_keys; i++)
>+		positions[i] = -ENOENT;
[Wang, Yipeng] So is this for performance reason?
>+
> 	do {
> 		/* Load the table change counter before the lookup
> 		 * starts. Acquire semantics will make sure that
>@@ -1899,7 +1950,6 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
>
> 		/* Compare keys, first hits in primary first */
> 		for (i = 0; i < num_keys; i++) {
>-			positions[i] = -ENOENT;
> 			while (prim_hitmask[i]) {
> 				uint32_t hit_index =
> 						__builtin_ctzl(prim_hitmask[i])
>@@ -1972,6 +2022,36 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
> 			continue;
> 		}
>
>+
>+		/* all found, do not need to go through ext bkt */
>+		if (hits == ((1ULL << num_keys) - 1)) {
>+			if (hit_mask != NULL)
>+				*hit_mask = hits;

[Wang, Yipeng] I think you need to check the version counter before return,
and how about the fence?
>+			return;
>+		}
>+		/* need to check ext buckets for match */
>+		if (h->ext_table_support) {
>+			for (i = 0; i < num_keys; i++) {
>+				if ((hits & (1ULL << i)) != 0)
>+					continue;
>+				next_bkt = secondary_bkt[i]->next;
>+				FOR_EACH_BUCKET(cur_bkt, next_bkt) {
>+					if (data != NULL)
>+						ret = search_one_bucket_lf(h,
>+							keys[i], sig[i],
>+							&data[i], cur_bkt);
>+					else
>+						ret = search_one_bucket_lf(h,
>+								keys[i], sig[i],
>+								NULL, cur_bkt);
>+					if (ret != -1) {
>+						positions[i] = ret;
>+						hits |= 1ULL << i;
>+						break;
>+					}
>+				}
>+			}
>+		}
> 		/* The loads of sig_current in compare_signatures
> 		 * should not move below the load from tbl_chng_cnt.
> 		 */
>@@ -1988,34 +2068,6 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
> 					__ATOMIC_ACQUIRE);
> 	} while (cnt_b != cnt_a);
>
>-	/* all found, do not need to go through ext bkt */
>-	if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
>-		if (hit_mask != NULL)
>-			*hit_mask = hits;
>-		__hash_rw_reader_unlock(h);
>-		return;
>-	}
>-
>-	/* need to check ext buckets for match */
>-	for (i = 0; i < num_keys; i++) {
>-		if ((hits & (1ULL << i)) != 0)
>-			continue;
>-		next_bkt = secondary_bkt[i]->next;
>-		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
>-			if (data != NULL)
>-				ret = search_one_bucket_lf(h, keys[i],
>-						sig[i], &data[i], cur_bkt);
>-			else
>-				ret = search_one_bucket_lf(h, keys[i],
>-						sig[i], NULL, cur_bkt);
>-			if (ret != -1) {
>-				positions[i] = ret;
>-				hits |= 1ULL << i;
>-				break;
>-			}
>-		}
>-	}
>-
> 	if (hit_mask != NULL)
> 		*hit_mask = hits;
> }
>diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h
>index eacdaa8d4684..062cc2dd0296 100644
>--- a/lib/librte_hash/rte_cuckoo_hash.h
>+++ b/lib/librte_hash/rte_cuckoo_hash.h
>@@ -129,6 +129,14 @@ struct lcore_cache {
>
> /* Structure that stores key-value pair */
> struct rte_hash_key {
>+	/* Stores index of an empty ext bkt to be recycled on calling
>+	 * rte_hash_del_xxx APIs. When lock free read-wrie concurrency is
[Wang, Yipeng] typo
>+	 * enabled, an empty ext bkt cannot be put into free list immediately
>+	 * (as readers might be using it still). Hence freeing of the ext bkt
>+	 * is piggy-backed to freeing of the key index.
>+	 */
[Wang, Yipeng] I am wondering if this breaks the "guarantee" provided by the ext table, since
a whole bucket could not be reused if one key is not freed. Is there any fundamental issue with
a new API to recycle the ext bucket, or do you just not want to add a new API?
>+	uint32_t ext_bkt_to_free;
>+
> 	union {
> 		uintptr_t idata;
> 		void *pdata;
>--
>2.17.1
  
Dharmik Thakkar March 7, 2019, 10:14 p.m. UTC | #2
+ Honnappa

Hi Yipeng,

Thank you for the review comments!


> On Mar 7, 2019, at 11:49 AM, Wang, Yipeng1 <yipeng1.wang@intel.com> wrote:
> 
> Thanks for this patch! Some initial comments inlined:
> 
>> -----Original Message-----
>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
>> Sent: Friday, March 1, 2019 4:24 PM
>> To: Wang, Yipeng1 <yipeng1.wang@intel.com>; Gobriel, Sameh <sameh.gobriel@intel.com>; Richardson, Bruce
>> <bruce.richardson@intel.com>; De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>; Mcnamara, John
>> <john.mcnamara@intel.com>; Kovacevic, Marko <marko.kovacevic@intel.com>
>> Cc: dev@dpdk.org; Dharmik Thakkar <dharmik.thakkar@arm.com>
>> Subject: [RFC 1/2] hash: add lock free support for extendable bucket
>> 
>> This patch enables lock-free read-write concurrency support for
>> extendable bucket feature.
>> 
>> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
>> ---
>> doc/guides/prog_guide/hash_lib.rst |   3 +-
>> lib/librte_hash/rte_cuckoo_hash.c  | 150 +++++++++++++++++++----------
>> lib/librte_hash/rte_cuckoo_hash.h  |   8 ++
>> 3 files changed, 110 insertions(+), 51 deletions(-)
>> 
>> diff --git a/doc/guides/prog_guide/hash_lib.rst b/doc/guides/prog_guide/hash_lib.rst
>> index 85a6edfa8b16..b00446e949ba 100644
>> --- a/doc/guides/prog_guide/hash_lib.rst
>> +++ b/doc/guides/prog_guide/hash_lib.rst
>> @@ -108,8 +108,7 @@ Extendable Bucket Functionality support
>> An extra flag is used to enable this functionality (flag is not set by default). When the (RTE_HASH_EXTRA_FLAGS_EXT_TABLE) is set
>> and
>> in the very unlikely case due to excessive hash collisions that a key has failed to be inserted, the hash table bucket is extended with a
>> linked
>> list to insert these failed keys. This feature is important for the workloads (e.g. telco workloads) that need to insert up to 100% of the
>> -hash table size and can't tolerate any key insertion failure (even if very few). Currently the extendable bucket is not supported
>> -with the lock-free concurrency implementation (RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF).
>> +hash table size and can't tolerate any key insertion failure (even if very few).
>> 
>> 
>> Implementation Details (non Extendable Bucket Case)
>> diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
>> index c01489ba5193..54762533ce36 100644
>> --- a/lib/librte_hash/rte_cuckoo_hash.c
>> +++ b/lib/librte_hash/rte_cuckoo_hash.c
>> @@ -170,15 +170,6 @@ rte_hash_create(const struct rte_hash_parameters *params)
>> 		return NULL;
>> 	}
>> 
>> -	if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) &&
>> -	    (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)) {
>> -		rte_errno = EINVAL;
>> -		RTE_LOG(ERR, HASH, "rte_hash_create: extendable bucket "
>> -			"feature not supported with rw concurrency "
>> -			"lock free\n");
>> -		return NULL;
>> -	}
>> -
>> 	/* Check extra flags field to check extra options. */
>> 	if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT)
>> 		hw_trans_mem_support = 1;
>> @@ -1054,7 +1045,15 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
>> 			/* Check if slot is available */
>> 			if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) {
>> 				cur_bkt->sig_current[i] = short_sig;
>> -				cur_bkt->key_idx[i] = new_idx;
>> +				/* Key can be of arbitrary length, so it is
>> +				 * not possible to store it atomically.
>> +				 * Hence the new key element's memory stores
>> +				 * (key as well as data) should be complete
>> +				 * before it is referenced.
>> +				 */
>> +				__atomic_store_n(&cur_bkt->key_idx[i],
>> +						 new_idx,
>> +						 __ATOMIC_RELEASE);
>> 				__hash_rw_writer_unlock(h);
>> 				return new_idx - 1;
>> 			}
>> @@ -1072,10 +1071,23 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
>> 	bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
>> 	/* Use the first location of the new bucket */
>> 	(h->buckets_ext[bkt_id]).sig_current[0] = short_sig;
>> -	(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
>> +	/* Key can be of arbitrary length, so it is
>> +	 * not possible to store it atomically.
>> +	 * Hence the new key element's memory stores
>> +	 * (key as well as data) should be complete
>> +	 * before it is referenced.
>> +	 */
>> +	__atomic_store_n(&(h->buckets_ext[bkt_id]).key_idx[0],
>> +			 new_idx,
>> +			 __ATOMIC_RELEASE);
> [Wang, Yipeng] Since it has not been linked and later on the linking is protected by
> release, do we really need atomic store here?
Atomic store is used here to maintain the code consistency.
> 
>> 	/* Link the new bucket to sec bucket linked list */
>> 	last = rte_hash_get_last_bkt(sec_bkt);
>> -	last->next = &h->buckets_ext[bkt_id];
>> +	/* New bucket's memory stores (key as well as data)
>> +	 * should be complete before it is referenced
>> +	 */
>> +	__atomic_store_n(&last->next,
>> +			 &h->buckets_ext[bkt_id],
>> +			 __ATOMIC_RELEASE);
>> 	__hash_rw_writer_unlock(h);
>> 	return new_idx - 1;
>> 
>> @@ -1366,7 +1378,8 @@ remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i)
>> * empty slot.
>> */
>> static inline void
>> -__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
>> +__rte_hash_compact_ll(const struct rte_hash *h,
>> +			struct rte_hash_bucket *cur_bkt, int pos) {
>> 	int i;
>> 	struct rte_hash_bucket *last_bkt;
>> 
>> @@ -1377,10 +1390,27 @@ __rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
>> 
>> 	for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) {
>> 		if (last_bkt->key_idx[i] != EMPTY_SLOT) {
>> -			cur_bkt->key_idx[pos] = last_bkt->key_idx[i];
>> 			cur_bkt->sig_current[pos] = last_bkt->sig_current[i];
>> +			__atomic_store_n(&cur_bkt->key_idx[pos],
>> +					 last_bkt->key_idx[i],
>> +					 __ATOMIC_RELEASE);
>> +			if (h->readwrite_concur_lf_support) {
>> +				/* Inform the readers that the table has changed
>> +				 * Since there is one writer, load acquires on
>> +				 * tbl_chng_cnt are not required.
>> +				 */
>> +				__atomic_store_n(h->tbl_chng_cnt,
>> +					 *h->tbl_chng_cnt + 1,
>> +					 __ATOMIC_RELEASE);
>> +				/* The stores to sig_alt and sig_current should
>> +				 * not move above the store to tbl_chng_cnt.
>> +				 */
>> +				__atomic_thread_fence(__ATOMIC_RELEASE);
>> +			}
>> 			last_bkt->sig_current[i] = NULL_SIGNATURE;
>> -			last_bkt->key_idx[i] = EMPTY_SLOT;
>> +			__atomic_store_n(&last_bkt->key_idx[i],
>> +					 EMPTY_SLOT,
>> +					 __ATOMIC_RELEASE);
>> 			return;
>> 		}
>> 	}
>> @@ -1449,7 +1479,7 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
>> 	/* look for key in primary bucket */
>> 	ret = search_and_remove(h, key, prim_bkt, short_sig, &pos);
>> 	if (ret != -1) {
>> -		__rte_hash_compact_ll(prim_bkt, pos);
>> +		__rte_hash_compact_ll(h, prim_bkt, pos);
>> 		last_bkt = prim_bkt->next;
>> 		prev_bkt = prim_bkt;
>> 		goto return_bkt;
>> @@ -1461,7 +1491,7 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
>> 	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
>> 		ret = search_and_remove(h, key, cur_bkt, short_sig, &pos);
>> 		if (ret != -1) {
>> -			__rte_hash_compact_ll(cur_bkt, pos);
>> +			__rte_hash_compact_ll(h, cur_bkt, pos);
>> 			last_bkt = sec_bkt->next;
>> 			prev_bkt = sec_bkt;
>> 			goto return_bkt;
>> @@ -1488,11 +1518,21 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
>> 	}
>> 	/* found empty bucket and recycle */
>> 	if (i == RTE_HASH_BUCKET_ENTRIES) {
>> -		prev_bkt->next = last_bkt->next = NULL;
>> +		__atomic_store_n(&prev_bkt->next,
>> +				 NULL,
>> +				 __ATOMIC_RELEASE);
>> 		uint32_t index = last_bkt - h->buckets_ext + 1;
>> -		rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
>> -	}
>> +		if (!h->no_free_on_del)
>> +			rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
>> +		else {
>> +			struct rte_hash_key *key_slot =
>> +				(struct rte_hash_key *)(
>> +				(char *)h->key_store +
>> +				ret * h->key_entry_size);
>> +			key_slot->ext_bkt_to_free = index;
> [Wang, Yipeng] Is there a chance that a key_slot may already have a previous ext_bkt
> and now get overwritten, so that the previous one is gone forever?
No, it is not possible, since the index is being stored in a key_slot which is associated with a deleted key.
>> 
>> +		}
>> +	}
>> 	__hash_rw_writer_unlock(h);
>> 	return ret;
>> }
>> @@ -1567,6 +1607,14 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>> 				(void *)((uintptr_t)position));
>> 	}
>> 
>> +	const struct rte_hash_key *key_slot = (const struct rte_hash_key *)(
>> +						(const char *)h->key_store +
>> +						position * h->key_entry_size);
>> +	uint32_t index = key_slot->ext_bkt_to_free;
>> +	if (!index)
>> +		/* Recycle empty ext bkt to free list. */
>> +		rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
>> +
>> 	return 0;
>> }
>> 
>> @@ -1855,6 +1903,9 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
>> 		rte_prefetch0(secondary_bkt[i]);
>> 	}
>> 
>> +	for (i = 0; i < num_keys; i++)
>> +		positions[i] = -ENOENT;
> [Wang, Yipeng] So is this for performance reason?
Resetting positions[] on each iteration of do…while() requires ‘hits’ to be reset as well, which causes a performance hit.
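A condensed sketch of the control flow in question (illustrative pseudo-C, not the
library code) shows why the initialization is hoisted out of the retry loop:

    /* Miss values are written once, so a retry triggered by a concurrent
     * writer only re-runs the compare loop; it does not have to clear
     * 'hits' and positions[] again.
     */
    for (i = 0; i < num_keys; i++)
        positions[i] = -ENOENT;

    do {
        cnt_b = __atomic_load_n(h->tbl_chng_cnt, __ATOMIC_ACQUIRE);
        /* ... compare signatures and keys, filling positions[i] and
         * setting bits in 'hits' for the entries that matched ...
         */
        cnt_a = __atomic_load_n(h->tbl_chng_cnt, __ATOMIC_ACQUIRE);
    } while (cnt_b != cnt_a);    /* retry if the table changed meanwhile */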
>> +
>> 	do {
>> 		/* Load the table change counter before the lookup
>> 		 * starts. Acquire semantics will make sure that
>> @@ -1899,7 +1950,6 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
>> 
>> 		/* Compare keys, first hits in primary first */
>> 		for (i = 0; i < num_keys; i++) {
>> -			positions[i] = -ENOENT;
>> 			while (prim_hitmask[i]) {
>> 				uint32_t hit_index =
>> 						__builtin_ctzl(prim_hitmask[i])
>> @@ -1972,6 +2022,36 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
>> 			continue;
>> 		}
>> 
>> +
>> +		/* all found, do not need to go through ext bkt */
>> +		if (hits == ((1ULL << num_keys) - 1)) {
>> +			if (hit_mask != NULL)
>> +				*hit_mask = hits;
> 
> [Wang, Yipeng] I think you need to check the version counter before return,
> and how about the fence?
If all the keys are found, there is no need to check the counter.
>> +			return;
>> +		}
>> +		/* need to check ext buckets for match */
>> +		if (h->ext_table_support) {
>> +			for (i = 0; i < num_keys; i++) {
>> +				if ((hits & (1ULL << i)) != 0)
>> +					continue;
>> +				next_bkt = secondary_bkt[i]->next;
>> +				FOR_EACH_BUCKET(cur_bkt, next_bkt) {
>> +					if (data != NULL)
>> +						ret = search_one_bucket_lf(h,
>> +							keys[i], sig[i],
>> +							&data[i], cur_bkt);
>> +					else
>> +						ret = search_one_bucket_lf(h,
>> +								keys[i], sig[i],
>> +								NULL, cur_bkt);
>> +					if (ret != -1) {
>> +						positions[i] = ret;
>> +						hits |= 1ULL << i;
>> +						break;
>> +					}
>> +				}
>> +			}
>> +		}
>> 		/* The loads of sig_current in compare_signatures
>> 		 * should not move below the load from tbl_chng_cnt.
>> 		 */
>> @@ -1988,34 +2068,6 @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
>> 					__ATOMIC_ACQUIRE);
>> 	} while (cnt_b != cnt_a);
>> 
>> -	/* all found, do not need to go through ext bkt */
>> -	if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
>> -		if (hit_mask != NULL)
>> -			*hit_mask = hits;
>> -		__hash_rw_reader_unlock(h);
>> -		return;
>> -	}
>> -
>> -	/* need to check ext buckets for match */
>> -	for (i = 0; i < num_keys; i++) {
>> -		if ((hits & (1ULL << i)) != 0)
>> -			continue;
>> -		next_bkt = secondary_bkt[i]->next;
>> -		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
>> -			if (data != NULL)
>> -				ret = search_one_bucket_lf(h, keys[i],
>> -						sig[i], &data[i], cur_bkt);
>> -			else
>> -				ret = search_one_bucket_lf(h, keys[i],
>> -						sig[i], NULL, cur_bkt);
>> -			if (ret != -1) {
>> -				positions[i] = ret;
>> -				hits |= 1ULL << i;
>> -				break;
>> -			}
>> -		}
>> -	}
>> -
>> 	if (hit_mask != NULL)
>> 		*hit_mask = hits;
>> }
>> diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h
>> index eacdaa8d4684..062cc2dd0296 100644
>> --- a/lib/librte_hash/rte_cuckoo_hash.h
>> +++ b/lib/librte_hash/rte_cuckoo_hash.h
>> @@ -129,6 +129,14 @@ struct lcore_cache {
>> 
>> /* Structure that stores key-value pair */
>> struct rte_hash_key {
>> +	/* Stores index of an empty ext bkt to be recycled on calling
>> +	 * rte_hash_del_xxx APIs. When lock free read-wrie concurrency is
> [Wang, Yipeng] typo
Will update it in the next version.
>> +	 * enabled, an empty ext bkt cannot be put into free list immediately
>> +	 * (as readers might be using it still). Hence freeing of the ext bkt
>> +	 * is piggy-backed to freeing of the key index.
>> +	 */
> [Wang, Yipeng] I am wondering if this breaks the "guarantee" provided by the ext table, since
> a whole bucket could not be reused if one key is not freed. Is there any fundamental issue with
> a new API to recycle the ext bucket, or do you just not want to add a new API?
With the lock-free feature, ‘delete’ becomes a two-step process of ‘delete’ and ‘free’. In other words,
it can be viewed by applications as a 'prolonged delete’. I’m not sure how adding a new API
to recycle the ext bucket would help solve the issue.
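For readers following the thread, a sketch of the resulting application-side flow
(the quiescence step is a hypothetical helper; how readers are quiesced, e.g. via
RCU, is left to the application):

    /* Step 1: logically delete the key; readers may still be using the
     * key slot and any ext bucket it pins.
     */
    int32_t pos = rte_hash_del_key(h, &key);
    if (pos >= 0) {
        /* Step 2: once no reader can still hold a reference ... */
        wait_for_readers_to_quiesce();    /* hypothetical, app-specific */
        /* ... free the key slot; with this patch an ext bucket emptied
         * by the delete is recycled here as well.
         */
        rte_hash_free_key_with_position(h, pos);
    }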
>> +	uint32_t ext_bkt_to_free;
>> +
>> 	union {
>> 		uintptr_t idata;
>> 		void *pdata;
>> --
>> 2.17.1
  
Honnappa Nagarahalli March 15, 2019, 12:31 a.m. UTC | #3
<snip>

> >> @@ -1072,10 +1071,23 @@ __rte_hash_add_key_with_hash(const struct
> rte_hash *h, const void *key,
> >> 	bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
> >> 	/* Use the first location of the new bucket */
> >> 	(h->buckets_ext[bkt_id]).sig_current[0] = short_sig;
> >> -	(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
> >> +	/* Key can be of arbitrary length, so it is
> >> +	 * not possible to store it atomically.
> >> +	 * Hence the new key element's memory stores
> >> +	 * (key as well as data) should be complete
> >> +	 * before it is referenced.
> >> +	 */
> >> +	__atomic_store_n(&(h->buckets_ext[bkt_id]).key_idx[0],
> >> +			 new_idx,
> >> +			 __ATOMIC_RELEASE);
> > [Wang, Yipeng] Since it has not been linked and later on the linking
> > is protected by release, do we really need atomic store here?
> Atomic store is used here to maintain the code consistency.
Agree the release order is not required. Removing it does not help much as it is only for the 1st element of a new bucket.
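An illustration of the point above (not the library code): while the new ext bucket
is still private to the writer, plain stores are sufficient; the single store-release
that links it into the chain is what publishes everything written before it to any
reader that observes the link with acquire semantics:

    bkt->sig_current[0] = short_sig;    /* bucket not yet reachable */
    bkt->key_idx[0] = new_idx;          /* plain store would be enough here */
    /* Publishes the two stores above to readers that load 'next' with
     * acquire ordering.
     */
    __atomic_store_n(&last->next, bkt, __ATOMIC_RELEASE);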

> >
> >> 	/* Link the new bucket to sec bucket linked list */
> >> 	last = rte_hash_get_last_bkt(sec_bkt);
> >> -	last->next = &h->buckets_ext[bkt_id];
> >> +	/* New bucket's memory stores (key as well as data)
> >> +	 * should be complete before it is referenced
> >> +	 */
> >> +	__atomic_store_n(&last->next,
> >> +			 &h->buckets_ext[bkt_id],
> >> +			 __ATOMIC_RELEASE);
The corresponding load-acquire is missing in the 'FOR_EACH_BUCKET' macro
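One possible shape of an acquire-ordered traversal (illustrative only, not taken
from the tree): the reader would load the 'next' pointer with acquire semantics so
that the bucket contents published by the writer's store-release are visible before
the bucket is examined:

    #define FOR_EACH_BUCKET_ACQ(CURRENT_BKT, START_BUCKET)              \
        for (CURRENT_BKT = START_BUCKET;                                \
             CURRENT_BKT != NULL;                                       \
             CURRENT_BKT = __atomic_load_n(&(CURRENT_BKT)->next,        \
                                           __ATOMIC_ACQUIRE))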

> >> 	__hash_rw_writer_unlock(h);
> >> 	return new_idx - 1;
> >>
> >> @@ -1366,7 +1378,8 @@ remove_entry(const struct rte_hash *h, struct
> >> rte_hash_bucket *bkt, unsigned i)
> >> * empty slot.
> >> */
> >> static inline void
> >> -__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
> >> +__rte_hash_compact_ll(const struct rte_hash *h,
> >> +			struct rte_hash_bucket *cur_bkt, int pos) {
> >> 	int i;
> >> 	struct rte_hash_bucket *last_bkt;
> >>
> >> @@ -1377,10 +1390,27 @@ __rte_hash_compact_ll(struct
> rte_hash_bucket
> >> *cur_bkt, int pos) {
> >>
> >> 	for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) {
> >> 		if (last_bkt->key_idx[i] != EMPTY_SLOT) {
> >> -			cur_bkt->key_idx[pos] = last_bkt->key_idx[i];
> >> 			cur_bkt->sig_current[pos] = last_bkt->sig_current[i];
> >> +			__atomic_store_n(&cur_bkt->key_idx[pos],
> >> +					 last_bkt->key_idx[i],
> >> +					 __ATOMIC_RELEASE);
> >> +			if (h->readwrite_concur_lf_support) {
> >> +				/* Inform the readers that the table has
> changed
> >> +				 * Since there is one writer, load acquires on
> >> +				 * tbl_chng_cnt are not required.
> >> +				 */
> >> +				__atomic_store_n(h->tbl_chng_cnt,
> >> +					 *h->tbl_chng_cnt + 1,
> >> +					 __ATOMIC_RELEASE);
> >> +				/* The stores to sig_alt and sig_current should
> >> +				 * not move above the store to tbl_chng_cnt.
> >> +				 */
> >> +				__atomic_thread_fence(__ATOMIC_RELEASE);
> >> +			}
> >> 			last_bkt->sig_current[i] = NULL_SIGNATURE;
> >> -			last_bkt->key_idx[i] = EMPTY_SLOT;
> >> +			__atomic_store_n(&last_bkt->key_idx[i],
> >> +					 EMPTY_SLOT,
> >> +					 __ATOMIC_RELEASE);
> >> 			return;
> >> 		}
> >> 	}
> >> @@ -1449,7 +1479,7 @@ __rte_hash_del_key_with_hash(const struct
> rte_hash *h, const void *key,
> >> 	/* look for key in primary bucket */
> >> 	ret = search_and_remove(h, key, prim_bkt, short_sig, &pos);
> >> 	if (ret != -1) {
> >> -		__rte_hash_compact_ll(prim_bkt, pos);
> >> +		__rte_hash_compact_ll(h, prim_bkt, pos);
> >> 		last_bkt = prim_bkt->next;
> >> 		prev_bkt = prim_bkt;
> >> 		goto return_bkt;
> >> @@ -1461,7 +1491,7 @@ __rte_hash_del_key_with_hash(const struct
> rte_hash *h, const void *key,
> >> 	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
> >> 		ret = search_and_remove(h, key, cur_bkt, short_sig, &pos);
> >> 		if (ret != -1) {
> >> -			__rte_hash_compact_ll(cur_bkt, pos);
> >> +			__rte_hash_compact_ll(h, cur_bkt, pos);
> >> 			last_bkt = sec_bkt->next;
> >> 			prev_bkt = sec_bkt;
> >> 			goto return_bkt;
> >> @@ -1488,11 +1518,21 @@ __rte_hash_del_key_with_hash(const struct
> rte_hash *h, const void *key,
> >> 	}
> >> 	/* found empty bucket and recycle */
> >> 	if (i == RTE_HASH_BUCKET_ENTRIES) {
> >> -		prev_bkt->next = last_bkt->next = NULL;
> >> +		__atomic_store_n(&prev_bkt->next,
> >> +				 NULL,
> >> +				 __ATOMIC_RELEASE);
> >> 		uint32_t index = last_bkt - h->buckets_ext + 1;
> >> -		rte_ring_sp_enqueue(h->free_ext_bkts, (void
> *)(uintptr_t)index);
> >> -	}
> >> +		if (!h->no_free_on_del)
> >> +			rte_ring_sp_enqueue(h->free_ext_bkts, (void
> *)(uintptr_t)index);
> >> +		else {
> >> +			struct rte_hash_key *key_slot =
> >> +				(struct rte_hash_key *)(
> >> +				(char *)h->key_store +
> >> +				ret * h->key_entry_size);
> >> +			key_slot->ext_bkt_to_free = index;
> > [Wang, Yipeng] Is there a chance that a key_slot may already have a
> > previous ext_bkt and now get overwritten, so that the previous one is gone
> forever?
> No, it is not possible, since the index is being stored in a key_slot which is
> associated with a deleted key.
> >>
> >> +		}
> >> +	}
> >> 	__hash_rw_writer_unlock(h);
> >> 	return ret;
> >> }
> >> @@ -1567,6 +1607,14 @@ rte_hash_free_key_with_position(const struct
> rte_hash *h,
> >> 				(void *)((uintptr_t)position));
> >> 	}
> >>
> >> +	const struct rte_hash_key *key_slot = (const struct rte_hash_key *)(
> >> +						(const char *)h->key_store +
> >> +						position * h->key_entry_size);
> >> +	uint32_t index = key_slot->ext_bkt_to_free;
> >> +	if (!index)
> >> +		/* Recycle empty ext bkt to free list. */
> >> +		rte_ring_sp_enqueue(h->free_ext_bkts, (void
> *)(uintptr_t)index);
Suggest moving this to before freeing the key_index to avoid race conditions.
key_slot->ext_bkt_to_free needs to be set to 0 after freeing.
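A sketch of the suggested shape for rte_hash_free_key_with_position() (illustrative;
it assumes an index of 0 means "no ext bucket pending", since valid indices into
buckets_ext start at 1):

    struct rte_hash_key *key_slot = (struct rte_hash_key *)
            ((char *)h->key_store + position * h->key_entry_size);
    uint32_t index = key_slot->ext_bkt_to_free;

    if (index) {
        /* Recycle the emptied ext bkt before the key index goes back to
         * the free list, and clear the field so a later reuse of this
         * key slot does not see a stale bucket index.
         */
        rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
        key_slot->ext_bkt_to_free = 0;
    }
    /* ... then return the key index itself to the free list as before ... */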

> >> +
> >> 	return 0;
> >> }
> >>
> >> @@ -1855,6 +1903,9 @@ __rte_hash_lookup_bulk_lf(const struct
> rte_hash *h, const void **keys,
> >> 		rte_prefetch0(secondary_bkt[i]);
> >> 	}
> >>
> >> +	for (i = 0; i < num_keys; i++)
> >> +		positions[i] = -ENOENT;
> > [Wang, Yipeng] So is this for performance reason?
> Resetting positions[] on each iteration of do…while() requires ‘hits’ to be reset
> as well, which causes a performance hit.
> >> +
> >> 	do {
> >> 		/* Load the table change counter before the lookup
> >> 		 * starts. Acquire semantics will make sure that @@ -1899,7
> +1950,6
> >> @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void
> >> **keys,
> >>
> >> 		/* Compare keys, first hits in primary first */
> >> 		for (i = 0; i < num_keys; i++) {
> >> -			positions[i] = -ENOENT;
> >> 			while (prim_hitmask[i]) {
> >> 				uint32_t hit_index =
> >> 						__builtin_ctzl(prim_hitmask[i])
> @@ -1972,6 +2022,36 @@
> >> __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
> >> 			continue;
> >> 		}
> >>
> >> +
> >> +		/* all found, do not need to go through ext bkt */
> >> +		if (hits == ((1ULL << num_keys) - 1)) {
> >> +			if (hit_mask != NULL)
> >> +				*hit_mask = hits;
> >
> > [Wang, Yipeng] I think you need to check the version counter before
> > return, and how about the fence?
> If all the keys are found, there is no need to check the counter.
> >> +			return;
> >> +		}
> >> +		/* need to check ext buckets for match */
> >> +		if (h->ext_table_support) {
> >> +			for (i = 0; i < num_keys; i++) {
> >> +				if ((hits & (1ULL << i)) != 0)
> >> +					continue;
> >> +				next_bkt = secondary_bkt[i]->next;
> >> +				FOR_EACH_BUCKET(cur_bkt, next_bkt) {
> >> +					if (data != NULL)
> >> +						ret = search_one_bucket_lf(h,
> >> +							keys[i], sig[i],
> >> +							&data[i], cur_bkt);
> >> +					else
> >> +						ret = search_one_bucket_lf(h,
> >> +								keys[i], sig[i],
> >> +								NULL,
> cur_bkt);
> >> +					if (ret != -1) {
> >> +						positions[i] = ret;
> >> +						hits |= 1ULL << i;
> >> +						break;
> >> +					}
> >> +				}
> >> +			}
> >> +		}
> >> 		/* The loads of sig_current in compare_signatures
> >> 		 * should not move below the load from tbl_chng_cnt.
> >> 		 */
> >> @@ -1988,34 +2068,6 @@ __rte_hash_lookup_bulk_lf(const struct
> rte_hash *h, const void **keys,
> >> 					__ATOMIC_ACQUIRE);
> >> 	} while (cnt_b != cnt_a);
> >>
> >> -	/* all found, do not need to go through ext bkt */
> >> -	if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
> >> -		if (hit_mask != NULL)
> >> -			*hit_mask = hits;
> >> -		__hash_rw_reader_unlock(h);
> >> -		return;
> >> -	}
> >> -
> >> -	/* need to check ext buckets for match */
> >> -	for (i = 0; i < num_keys; i++) {
> >> -		if ((hits & (1ULL << i)) != 0)
> >> -			continue;
> >> -		next_bkt = secondary_bkt[i]->next;
> >> -		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
> >> -			if (data != NULL)
> >> -				ret = search_one_bucket_lf(h, keys[i],
> >> -						sig[i], &data[i], cur_bkt);
> >> -			else
> >> -				ret = search_one_bucket_lf(h, keys[i],
> >> -						sig[i], NULL, cur_bkt);
> >> -			if (ret != -1) {
> >> -				positions[i] = ret;
> >> -				hits |= 1ULL << i;
> >> -				break;
> >> -			}
> >> -		}
> >> -	}
> >> -
> >> 	if (hit_mask != NULL)
> >> 		*hit_mask = hits;
> >> }
> >> diff --git a/lib/librte_hash/rte_cuckoo_hash.h
> >> b/lib/librte_hash/rte_cuckoo_hash.h
> >> index eacdaa8d4684..062cc2dd0296 100644
> >> --- a/lib/librte_hash/rte_cuckoo_hash.h
> >> +++ b/lib/librte_hash/rte_cuckoo_hash.h
> >> @@ -129,6 +129,14 @@ struct lcore_cache {
> >>
> >> /* Structure that stores key-value pair */ struct rte_hash_key {
> >> +	/* Stores index of an empty ext bkt to be recycled on calling
> >> +	 * rte_hash_del_xxx APIs. When lock free read-wrie concurrency is
> > [Wang, Yipeng] typo
> Will update it in the next version.
> >> +	 * enabled, an empty ext bkt cannot be put into free list immediately
> >> +	 * (as readers might be using it still). Hence freeing of the ext bkt
> >> +	 * is piggy-backed to freeing of the key index.
> >> +	 */
> > [Wang, Yipeng] I am wondering if this breaks the "guarantee" provided
> > by the ext table, since a whole bucket could not be reused if one key is not
> > freed. Is there any fundamental issue with a new API to recycle the ext bucket,
> or do you just not want to add a new API?
> With the lock-free feature, ‘delete’ becomes a two-step process of ‘delete’ and
> ‘free’. In other words, it can be viewed by applications as a 'prolonged
> delete’. I’m not sure how adding a new API to recycle the ext bucket would help
> solve the issue.
> >> +	uint32_t ext_bkt_to_free;
> >> +
> >> 	union {
> >> 		uintptr_t idata;
> >> 		void *pdata;
> >> --
> >> 2.17.1
  
Dharmik Thakkar March 25, 2019, 8:06 p.m. UTC | #4
Hi Honnappa,

Thank you for the review comments!

> On Mar 14, 2019, at 7:31 PM, Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:
> 
> <snip>
> 
>>>> @@ -1072,10 +1071,23 @@ __rte_hash_add_key_with_hash(const struct
>> rte_hash *h, const void *key,
>>>> 	bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
>>>> 	/* Use the first location of the new bucket */
>>>> 	(h->buckets_ext[bkt_id]).sig_current[0] = short_sig;
>>>> -	(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
>>>> +	/* Key can be of arbitrary length, so it is
>>>> +	 * not possible to store it atomically.
>>>> +	 * Hence the new key element's memory stores
>>>> +	 * (key as well as data) should be complete
>>>> +	 * before it is referenced.
>>>> +	 */
>>>> +	__atomic_store_n(&(h->buckets_ext[bkt_id]).key_idx[0],
>>>> +			 new_idx,
>>>> +			 __ATOMIC_RELEASE);
>>> [Wang, Yipeng] Since it has not been linked and later on the linking
>>> is protected by release, do we really need atomic store here?
>> Atomic store is used here to maintain the code consistency.
> Agree the release order is not required. Removing it does not help much as it is only for the 1st element of a new bucket.
> 
>>> 
>>>> 	/* Link the new bucket to sec bucket linked list */
>>>> 	last = rte_hash_get_last_bkt(sec_bkt);
>>>> -	last->next = &h->buckets_ext[bkt_id];
>>>> +	/* New bucket's memory stores (key as well as data)
>>>> +	 * should be complete before it is referenced
>>>> +	 */
>>>> +	__atomic_store_n(&last->next,
>>>> +			 &h->buckets_ext[bkt_id],
>>>> +			 __ATOMIC_RELEASE);
> The corresponding load-acquire is missing in the 'FOR_EACH_BUCKET' macro
It is not required. Also, this store-release can be removed, as there won’t be data race conditions since key_idx and sig are stored atomically. I have updated it in the patch.
> 
>>>> 	__hash_rw_writer_unlock(h);
>>>> 	return new_idx - 1;
>>>> 
>>>> @@ -1366,7 +1378,8 @@ remove_entry(const struct rte_hash *h, struct
>>>> rte_hash_bucket *bkt, unsigned i)
>>>> * empty slot.
>>>> */
>>>> static inline void
>>>> -__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
>>>> +__rte_hash_compact_ll(const struct rte_hash *h,
>>>> +			struct rte_hash_bucket *cur_bkt, int pos) {
>>>> 	int i;
>>>> 	struct rte_hash_bucket *last_bkt;
>>>> 
>>>> @@ -1377,10 +1390,27 @@ __rte_hash_compact_ll(struct
>> rte_hash_bucket
>>>> *cur_bkt, int pos) {
>>>> 
>>>> 	for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) {
>>>> 		if (last_bkt->key_idx[i] != EMPTY_SLOT) {
>>>> -			cur_bkt->key_idx[pos] = last_bkt->key_idx[i];
>>>> 			cur_bkt->sig_current[pos] = last_bkt->sig_current[i];
>>>> +			__atomic_store_n(&cur_bkt->key_idx[pos],
>>>> +					 last_bkt->key_idx[i],
>>>> +					 __ATOMIC_RELEASE);
>>>> +			if (h->readwrite_concur_lf_support) {
>>>> +				/* Inform the readers that the table has
>> changed
>>>> +				 * Since there is one writer, load acquires on
>>>> +				 * tbl_chng_cnt are not required.
>>>> +				 */
>>>> +				__atomic_store_n(h->tbl_chng_cnt,
>>>> +					 *h->tbl_chng_cnt + 1,
>>>> +					 __ATOMIC_RELEASE);
>>>> +				/* The stores to sig_alt and sig_current should
>>>> +				 * not move above the store to tbl_chng_cnt.
>>>> +				 */
>>>> +				__atomic_thread_fence(__ATOMIC_RELEASE);
>>>> +			}
>>>> 			last_bkt->sig_current[i] = NULL_SIGNATURE;
>>>> -			last_bkt->key_idx[i] = EMPTY_SLOT;
>>>> +			__atomic_store_n(&last_bkt->key_idx[i],
>>>> +					 EMPTY_SLOT,
>>>> +					 __ATOMIC_RELEASE);
>>>> 			return;
>>>> 		}
>>>> 	}
>>>> @@ -1449,7 +1479,7 @@ __rte_hash_del_key_with_hash(const struct
>> rte_hash *h, const void *key,
>>>> 	/* look for key in primary bucket */
>>>> 	ret = search_and_remove(h, key, prim_bkt, short_sig, &pos);
>>>> 	if (ret != -1) {
>>>> -		__rte_hash_compact_ll(prim_bkt, pos);
>>>> +		__rte_hash_compact_ll(h, prim_bkt, pos);
>>>> 		last_bkt = prim_bkt->next;
>>>> 		prev_bkt = prim_bkt;
>>>> 		goto return_bkt;
>>>> @@ -1461,7 +1491,7 @@ __rte_hash_del_key_with_hash(const struct
>> rte_hash *h, const void *key,
>>>> 	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
>>>> 		ret = search_and_remove(h, key, cur_bkt, short_sig, &pos);
>>>> 		if (ret != -1) {
>>>> -			__rte_hash_compact_ll(cur_bkt, pos);
>>>> +			__rte_hash_compact_ll(h, cur_bkt, pos);
>>>> 			last_bkt = sec_bkt->next;
>>>> 			prev_bkt = sec_bkt;
>>>> 			goto return_bkt;
>>>> @@ -1488,11 +1518,21 @@ __rte_hash_del_key_with_hash(const struct
>> rte_hash *h, const void *key,
>>>> 	}
>>>> 	/* found empty bucket and recycle */
>>>> 	if (i == RTE_HASH_BUCKET_ENTRIES) {
>>>> -		prev_bkt->next = last_bkt->next = NULL;
>>>> +		__atomic_store_n(&prev_bkt->next,
>>>> +				 NULL,
>>>> +				 __ATOMIC_RELEASE);
>>>> 		uint32_t index = last_bkt - h->buckets_ext + 1;
>>>> -		rte_ring_sp_enqueue(h->free_ext_bkts, (void
>> *)(uintptr_t)index);
>>>> -	}
>>>> +		if (!h->no_free_on_del)
>>>> +			rte_ring_sp_enqueue(h->free_ext_bkts, (void
>> *)(uintptr_t)index);
>>>> +		else {
>>>> +			struct rte_hash_key *key_slot =
>>>> +				(struct rte_hash_key *)(
>>>> +				(char *)h->key_store +
>>>> +				ret * h->key_entry_size);
>>>> +			key_slot->ext_bkt_to_free = index;
>>> [Wang, Yipeng] Is there a chance that a key_slot may already have a
>>> previous ext_bkt and now get overwritten, so that the previous one is gone
>> forever?
>> No, it is not possible, since the index is being stored in a key_slot which is
>> associated with a deleted key.
>>>> 
>>>> +		}
>>>> +	}
>>>> 	__hash_rw_writer_unlock(h);
>>>> 	return ret;
>>>> }
>>>> @@ -1567,6 +1607,14 @@ rte_hash_free_key_with_position(const struct
>> rte_hash *h,
>>>> 				(void *)((uintptr_t)position));
>>>> 	}
>>>> 
>>>> +	const struct rte_hash_key *key_slot = (const struct rte_hash_key *)(
>>>> +						(const char *)h->key_store +
>>>> +						position * h->key_entry_size);
>>>> +	uint32_t index = key_slot->ext_bkt_to_free;
>>>> +	if (!index)
>>>> +		/* Recycle empty ext bkt to free list. */
>>>> +		rte_ring_sp_enqueue(h->free_ext_bkts, (void
>> *)(uintptr_t)index);
> Suggest moving this to before freeing the key_index to avoid race conditions.
> key_slot->ext_bkt_to_free needs to be set to 0 after freeing.
Correct. Updated in the patch.
> 
>>>> +
>>>> 	return 0;
>>>> }
>>>> 
>>>> @@ -1855,6 +1903,9 @@ __rte_hash_lookup_bulk_lf(const struct
>> rte_hash *h, const void **keys,
>>>> 		rte_prefetch0(secondary_bkt[i]);
>>>> 	}
>>>> 
>>>> +	for (i = 0; i < num_keys; i++)
>>>> +		positions[i] = -ENOENT;
>>> [Wang, Yipeng] So is this for performance reason?
>> Resetting positions[] on each iteration of do…while() requires ‘hits’ to be reset
>> as well, which causes a performance hit.
>>>> +
>>>> 	do {
>>>> 		/* Load the table change counter before the lookup
>>>> 		 * starts. Acquire semantics will make sure that @@ -1899,7
>> +1950,6
>>>> @@ __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void
>>>> **keys,
>>>> 
>>>> 		/* Compare keys, first hits in primary first */
>>>> 		for (i = 0; i < num_keys; i++) {
>>>> -			positions[i] = -ENOENT;
>>>> 			while (prim_hitmask[i]) {
>>>> 				uint32_t hit_index =
>>>> 						__builtin_ctzl(prim_hitmask[i])
>> @@ -1972,6 +2022,36 @@
>>>> __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
>>>> 			continue;
>>>> 		}
>>>> 
>>>> +
>>>> +		/* all found, do not need to go through ext bkt */
>>>> +		if (hits == ((1ULL << num_keys) - 1)) {
>>>> +			if (hit_mask != NULL)
>>>> +				*hit_mask = hits;
>>> 
>>> [Wang, Yipeng] I think you need to check the version counter before
>>> return, and how about the fence?
>> If all the keys are found, there is no need to check the counter.
>>>> +			return;
>>>> +		}
>>>> +		/* need to check ext buckets for match */
>>>> +		if (h->ext_table_support) {
>>>> +			for (i = 0; i < num_keys; i++) {
>>>> +				if ((hits & (1ULL << i)) != 0)
>>>> +					continue;
>>>> +				next_bkt = secondary_bkt[i]->next;
>>>> +				FOR_EACH_BUCKET(cur_bkt, next_bkt) {
>>>> +					if (data != NULL)
>>>> +						ret = search_one_bucket_lf(h,
>>>> +							keys[i], sig[i],
>>>> +							&data[i], cur_bkt);
>>>> +					else
>>>> +						ret = search_one_bucket_lf(h,
>>>> +								keys[i], sig[i],
>>>> +								NULL,
>> cur_bkt);
>>>> +					if (ret != -1) {
>>>> +						positions[i] = ret;
>>>> +						hits |= 1ULL << i;
>>>> +						break;
>>>> +					}
>>>> +				}
>>>> +			}
>>>> +		}
>>>> 		/* The loads of sig_current in compare_signatures
>>>> 		 * should not move below the load from tbl_chng_cnt.
>>>> 		 */
>>>> @@ -1988,34 +2068,6 @@ __rte_hash_lookup_bulk_lf(const struct
>> rte_hash *h, const void **keys,
>>>> 					__ATOMIC_ACQUIRE);
>>>> 	} while (cnt_b != cnt_a);
>>>> 
>>>> -	/* all found, do not need to go through ext bkt */
>>>> -	if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
>>>> -		if (hit_mask != NULL)
>>>> -			*hit_mask = hits;
>>>> -		__hash_rw_reader_unlock(h);
>>>> -		return;
>>>> -	}
>>>> -
>>>> -	/* need to check ext buckets for match */
>>>> -	for (i = 0; i < num_keys; i++) {
>>>> -		if ((hits & (1ULL << i)) != 0)
>>>> -			continue;
>>>> -		next_bkt = secondary_bkt[i]->next;
>>>> -		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
>>>> -			if (data != NULL)
>>>> -				ret = search_one_bucket_lf(h, keys[i],
>>>> -						sig[i], &data[i], cur_bkt);
>>>> -			else
>>>> -				ret = search_one_bucket_lf(h, keys[i],
>>>> -						sig[i], NULL, cur_bkt);
>>>> -			if (ret != -1) {
>>>> -				positions[i] = ret;
>>>> -				hits |= 1ULL << i;
>>>> -				break;
>>>> -			}
>>>> -		}
>>>> -	}
>>>> -
>>>> 	if (hit_mask != NULL)
>>>> 		*hit_mask = hits;
>>>> }
>>>> diff --git a/lib/librte_hash/rte_cuckoo_hash.h
>>>> b/lib/librte_hash/rte_cuckoo_hash.h
>>>> index eacdaa8d4684..062cc2dd0296 100644
>>>> --- a/lib/librte_hash/rte_cuckoo_hash.h
>>>> +++ b/lib/librte_hash/rte_cuckoo_hash.h
>>>> @@ -129,6 +129,14 @@ struct lcore_cache {
>>>> 
>>>> /* Structure that stores key-value pair */ struct rte_hash_key {
>>>> +	/* Stores index of an empty ext bkt to be recycled on calling
>>>> +	 * rte_hash_del_xxx APIs. When lock free read-wrie concurrency is
>>> [Wang, Yipeng] typo
>> Will update it in the next version.
>>>> +	 * enabled, an empty ext bkt cannot be put into free list immediately
>>>> +	 * (as readers might be using it still). Hence freeing of the ext bkt
>>>> +	 * is piggy-backed to freeing of the key index.
>>>> +	 */
>>> [Wang, Yipeng] I am wondering if this breaks the "guarantee" provided
>>> by the ext table, since a whole bucket could not be reused if one key is not
>>> freed. Is there any fundamental issue with a new API to recycle the ext bucket,
>> or do you just not want to add a new API?
>> With the lock-free feature, ‘delete’ becomes a two-step process of ‘delete’ and
>> ‘free’. In other words, it can be viewed by applications as a 'prolonged
>> delete’. I’m not sure how adding a new API to recycle the ext bucket would help
>> solve the issue.
>>>> +	uint32_t ext_bkt_to_free;
>>>> +
>>>> 	union {
>>>> 		uintptr_t idata;
>>>> 		void *pdata;
>>>> --
>>>> 2.17.1
>
  

Patch

diff --git a/doc/guides/prog_guide/hash_lib.rst b/doc/guides/prog_guide/hash_lib.rst
index 85a6edfa8b16..b00446e949ba 100644
--- a/doc/guides/prog_guide/hash_lib.rst
+++ b/doc/guides/prog_guide/hash_lib.rst
@@ -108,8 +108,7 @@  Extendable Bucket Functionality support
 An extra flag is used to enable this functionality (flag is not set by default). When the (RTE_HASH_EXTRA_FLAGS_EXT_TABLE) is set and
 in the very unlikely case due to excessive hash collisions that a key has failed to be inserted, the hash table bucket is extended with a linked
 list to insert these failed keys. This feature is important for the workloads (e.g. telco workloads) that need to insert up to 100% of the
-hash table size and can't tolerate any key insertion failure (even if very few). Currently the extendable bucket is not supported
-with the lock-free concurrency implementation (RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF).
+hash table size and can't tolerate any key insertion failure (even if very few).
 
 
 Implementation Details (non Extendable Bucket Case)
diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
index c01489ba5193..54762533ce36 100644
--- a/lib/librte_hash/rte_cuckoo_hash.c
+++ b/lib/librte_hash/rte_cuckoo_hash.c
@@ -170,15 +170,6 @@  rte_hash_create(const struct rte_hash_parameters *params)
 		return NULL;
 	}
 
-	if ((params->extra_flag & RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF) &&
-	    (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)) {
-		rte_errno = EINVAL;
-		RTE_LOG(ERR, HASH, "rte_hash_create: extendable bucket "
-			"feature not supported with rw concurrency "
-			"lock free\n");
-		return NULL;
-	}
-
 	/* Check extra flags field to check extra options. */
 	if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT)
 		hw_trans_mem_support = 1;
@@ -1054,7 +1045,15 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
 			/* Check if slot is available */
 			if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) {
 				cur_bkt->sig_current[i] = short_sig;
-				cur_bkt->key_idx[i] = new_idx;
+				/* Key can be of arbitrary length, so it is
+				 * not possible to store it atomically.
+				 * Hence the new key element's memory stores
+				 * (key as well as data) should be complete
+				 * before it is referenced.
+				 */
+				__atomic_store_n(&cur_bkt->key_idx[i],
+						 new_idx,
+						 __ATOMIC_RELEASE);
 				__hash_rw_writer_unlock(h);
 				return new_idx - 1;
 			}
@@ -1072,10 +1071,23 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
 	bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
 	/* Use the first location of the new bucket */
 	(h->buckets_ext[bkt_id]).sig_current[0] = short_sig;
-	(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
+	/* Key can be of arbitrary length, so it is
+	 * not possible to store it atomically.
+	 * Hence the new key element's memory stores
+	 * (key as well as data) should be complete
+	 * before it is referenced.
+	 */
+	__atomic_store_n(&(h->buckets_ext[bkt_id]).key_idx[0],
+			 new_idx,
+			 __ATOMIC_RELEASE);
 	/* Link the new bucket to sec bucket linked list */
 	last = rte_hash_get_last_bkt(sec_bkt);
-	last->next = &h->buckets_ext[bkt_id];
+	/* New bucket's memory stores (key as well as data)
+	 * should be complete before it is referenced
+	 */
+	__atomic_store_n(&last->next,
+			 &h->buckets_ext[bkt_id],
+			 __ATOMIC_RELEASE);
 	__hash_rw_writer_unlock(h);
 	return new_idx - 1;
 
@@ -1366,7 +1378,8 @@  remove_entry(const struct rte_hash *h, struct rte_hash_bucket *bkt, unsigned i)
  * empty slot.
  */
 static inline void
-__rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
+__rte_hash_compact_ll(const struct rte_hash *h,
+			struct rte_hash_bucket *cur_bkt, int pos) {
 	int i;
 	struct rte_hash_bucket *last_bkt;
 
@@ -1377,10 +1390,27 @@  __rte_hash_compact_ll(struct rte_hash_bucket *cur_bkt, int pos) {
 
 	for (i = RTE_HASH_BUCKET_ENTRIES - 1; i >= 0; i--) {
 		if (last_bkt->key_idx[i] != EMPTY_SLOT) {
-			cur_bkt->key_idx[pos] = last_bkt->key_idx[i];
 			cur_bkt->sig_current[pos] = last_bkt->sig_current[i];
+			__atomic_store_n(&cur_bkt->key_idx[pos],
+					 last_bkt->key_idx[i],
+					 __ATOMIC_RELEASE);
+			if (h->readwrite_concur_lf_support) {
+				/* Inform the readers that the table has changed
+				 * Since there is one writer, load acquires on
+				 * tbl_chng_cnt are not required.
+				 */
+				__atomic_store_n(h->tbl_chng_cnt,
+					 *h->tbl_chng_cnt + 1,
+					 __ATOMIC_RELEASE);
+				/* The stores to sig_alt and sig_current should
+				 * not move above the store to tbl_chng_cnt.
+				 */
+				__atomic_thread_fence(__ATOMIC_RELEASE);
+			}
 			last_bkt->sig_current[i] = NULL_SIGNATURE;
-			last_bkt->key_idx[i] = EMPTY_SLOT;
+			__atomic_store_n(&last_bkt->key_idx[i],
+					 EMPTY_SLOT,
+					 __ATOMIC_RELEASE);
 			return;
 		}
 	}
@@ -1449,7 +1479,7 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
 	/* look for key in primary bucket */
 	ret = search_and_remove(h, key, prim_bkt, short_sig, &pos);
 	if (ret != -1) {
-		__rte_hash_compact_ll(prim_bkt, pos);
+		__rte_hash_compact_ll(h, prim_bkt, pos);
 		last_bkt = prim_bkt->next;
 		prev_bkt = prim_bkt;
 		goto return_bkt;
@@ -1461,7 +1491,7 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
 	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
 		ret = search_and_remove(h, key, cur_bkt, short_sig, &pos);
 		if (ret != -1) {
-			__rte_hash_compact_ll(cur_bkt, pos);
+			__rte_hash_compact_ll(h, cur_bkt, pos);
 			last_bkt = sec_bkt->next;
 			prev_bkt = sec_bkt;
 			goto return_bkt;
@@ -1488,11 +1518,21 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
 	}
 	/* found empty bucket and recycle */
 	if (i == RTE_HASH_BUCKET_ENTRIES) {
-		prev_bkt->next = last_bkt->next = NULL;
+		__atomic_store_n(&prev_bkt->next,
+				 NULL,
+				 __ATOMIC_RELEASE);
 		uint32_t index = last_bkt - h->buckets_ext + 1;
-		rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
-	}
+		if (!h->no_free_on_del)
+			rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
+		else {
+			struct rte_hash_key *key_slot =
+				(struct rte_hash_key *)(
+				(char *)h->key_store +
+				ret * h->key_entry_size);
+			key_slot->ext_bkt_to_free = index;
 
+		}
+	}
 	__hash_rw_writer_unlock(h);
 	return ret;
 }
@@ -1567,6 +1607,14 @@  rte_hash_free_key_with_position(const struct rte_hash *h,
 				(void *)((uintptr_t)position));
 	}
 
+	const struct rte_hash_key *key_slot = (const struct rte_hash_key *)(
+						(const char *)h->key_store +
+						position * h->key_entry_size);
+	uint32_t index = key_slot->ext_bkt_to_free;
+	if (!index)
+		/* Recycle empty ext bkt to free list. */
+		rte_ring_sp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
+
 	return 0;
 }
 
@@ -1855,6 +1903,9 @@  __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
 		rte_prefetch0(secondary_bkt[i]);
 	}
 
+	for (i = 0; i < num_keys; i++)
+		positions[i] = -ENOENT;
+
 	do {
 		/* Load the table change counter before the lookup
 		 * starts. Acquire semantics will make sure that
@@ -1899,7 +1950,6 @@  __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			positions[i] = -ENOENT;
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(prim_hitmask[i])
@@ -1972,6 +2022,36 @@  __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
+
+		/* all found, do not need to go through ext bkt */
+		if (hits == ((1ULL << num_keys) - 1)) {
+			if (hit_mask != NULL)
+				*hit_mask = hits;
+			return;
+		}
+		/* need to check ext buckets for match */
+		if (h->ext_table_support) {
+			for (i = 0; i < num_keys; i++) {
+				if ((hits & (1ULL << i)) != 0)
+					continue;
+				next_bkt = secondary_bkt[i]->next;
+				FOR_EACH_BUCKET(cur_bkt, next_bkt) {
+					if (data != NULL)
+						ret = search_one_bucket_lf(h,
+							keys[i], sig[i],
+							&data[i], cur_bkt);
+					else
+						ret = search_one_bucket_lf(h,
+								keys[i], sig[i],
+								NULL, cur_bkt);
+					if (ret != -1) {
+						positions[i] = ret;
+						hits |= 1ULL << i;
+						break;
+					}
+				}
+			}
+		}
 		/* The loads of sig_current in compare_signatures
 		 * should not move below the load from tbl_chng_cnt.
 		 */
@@ -1988,34 +2068,6 @@  __rte_hash_lookup_bulk_lf(const struct rte_hash *h, const void **keys,
 					__ATOMIC_ACQUIRE);
 	} while (cnt_b != cnt_a);
 
-	/* all found, do not need to go through ext bkt */
-	if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
-		if (hit_mask != NULL)
-			*hit_mask = hits;
-		__hash_rw_reader_unlock(h);
-		return;
-	}
-
-	/* need to check ext buckets for match */
-	for (i = 0; i < num_keys; i++) {
-		if ((hits & (1ULL << i)) != 0)
-			continue;
-		next_bkt = secondary_bkt[i]->next;
-		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
-			if (data != NULL)
-				ret = search_one_bucket_lf(h, keys[i],
-						sig[i], &data[i], cur_bkt);
-			else
-				ret = search_one_bucket_lf(h, keys[i],
-						sig[i], NULL, cur_bkt);
-			if (ret != -1) {
-				positions[i] = ret;
-				hits |= 1ULL << i;
-				break;
-			}
-		}
-	}
-
 	if (hit_mask != NULL)
 		*hit_mask = hits;
 }
diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h
index eacdaa8d4684..062cc2dd0296 100644
--- a/lib/librte_hash/rte_cuckoo_hash.h
+++ b/lib/librte_hash/rte_cuckoo_hash.h
@@ -129,6 +129,14 @@  struct lcore_cache {
 
 /* Structure that stores key-value pair */
 struct rte_hash_key {
+	/* Stores index of an empty ext bkt to be recycled on calling
+	 * rte_hash_del_xxx APIs. When lock free read-wrie concurrency is
+	 * enabled, an empty ext bkt cannot be put into free list immediately
+	 * (as readers might be using it still). Hence freeing of the ext bkt
+	 * is piggy-backed to freeing of the key index.
+	 */
+	uint32_t ext_bkt_to_free;
+
 	union {
 		uintptr_t idata;
 		void *pdata;