diff --git a/.gitignore b/.gitignore
index 122c321f..eba684ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,8 @@
 .idea
 .vscode
 build
-
+.clwb
+cmake-build-debug/
 docs/build
 docs/source/README.md
 docs/source/CONTRIBUTING.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77190094..34016ca8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,3 +159,8 @@ add_executable(find_with_missed_keys_test tests/find_with_missed_keys_test.cc.cu
 target_compile_features(find_with_missed_keys_test PUBLIC cxx_std_14)
 set_target_properties(find_with_missed_keys_test PROPERTIES  CUDA_ARCHITECTURES OFF)
 TARGET_LINK_LIBRARIES(find_with_missed_keys_test gtest_main)
+
+add_executable(reserved_keys_test tests/reserved_keys_test.cc.cu)
+target_compile_features(reserved_keys_test PUBLIC cxx_std_14)
+set_target_properties(reserved_keys_test PROPERTIES  CUDA_ARCHITECTURES OFF)
+TARGET_LINK_LIBRARIES(reserved_keys_test gtest_main)
\ No newline at end of file
diff --git a/README.md b/README.md
index 798819e3..039a8274 100644
--- a/README.md
+++ b/README.md
@@ -91,19 +91,30 @@ The `score_type` must be `uint64_t`. For more detail, please refer to [`class Ev
 
 It's recommended to keep the default configuration for the options ending with `*`.
 
-| Name                    | Type            | Default | Description                                           |
-|:------------------------|:----------------|:--------|:------------------------------------------------------|
-| __init_capacity__       | size_t          | 0       | The initial capacity of the hash table.               |
-| __max_capacity__        | size_t          | 0       | The maximum capacity of the hash table.               |
-| __max_hbm_for_vectors__ | size_t          | 0       | The maximum HBM for vectors, in bytes.                |
-| __dim__                 | size_t          | 64      | The dimension of the value vectors.                   |
-| __max_bucket_size*__    | size_t          | 128     | The length of each bucket.                            |
-| __max_load_factor*__    | float           | 0.5f    | The max load factor before rehashing.                 |
-| __block_size*__         | int             | 128     | The default block size for CUDA kernels.              |
-| __io_block_size*__      | int             | 1024    | The block size for IO CUDA kernels.                   |
-| __device_id*__          | int             | -1      | The ID of device. Managed internally when set to `-1` |
-| __io_by_cpu*__          | bool            | false   | The flag indicating if the CPU handles IO.            |
-
+| Name                       | Type   | Default | Description                                           |
+|:---------------------------|:-------|:--------|:------------------------------------------------------|
+| __init_capacity__          | size_t | 0       | The initial capacity of the hash table.               |
+| __max_capacity__           | size_t | 0       | The maximum capacity of the hash table.               |
+| __max_hbm_for_vectors__    | size_t | 0       | The maximum HBM for vectors, in bytes.                |
+| __dim__                    | size_t | 64      | The dimension of the value vectors.                   |
+| __max_bucket_size*__       | size_t | 128     | The length of each bucket.                            |
+| __max_load_factor*__       | float  | 0.5f    | The max load factor before rehashing.                 |
+| __block_size*__            | int    | 128     | The default block size for CUDA kernels.              |
+| __io_block_size*__         | int    | 1024    | The block size for IO CUDA kernels.                   |
+| __device_id*__             | int    | -1      | The ID of device. Managed internally when set to `-1` |
+| __io_by_cpu*__             | bool   | false   | The flag indicating if the CPU handles IO.            |
+| __reserved_key_start_bit__ | int    | 0       | The start bit offset of reserved key in the 64 bit    |
+
+#### Reserved Keys
+- The keys of `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal using.
+- Call set options.reserved_key_start_bit to change the reserved keys if the default one conflicted with your keys.
+  The valid range of reserved_key_start_bit is [0, 62] and the default value is 0, meaning the default reserved keys.
+  reserved_key_start_bit = 1 means using the insignificant bits 1 and 2 as the keys as the reserved keys, 
+  in binary format, it looks like 111~11xx0, and the index 0 bit is 0 and all the other bits are positive, in this case the new reserved keys are
+  `FFFFFFFFFFFFFFFE`, `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA`
+  reserved_key_start_bit = 2, in binary format, it looks like 111~11xx10, bit offset 0 are always 0 for any reserved_key_start_bit != 0,
+- if you change the reserved_key_start_bit, you should use same value for save/load
+  For more detail, please refer to [`init_reserved_keys`](https://github.com/search?q=repo%3ANVIDIA-Merlin%2FHierarchicalKV%20init_reserved_keys&type=code).
 For more detail, please refer to [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60).
 
 ### How to use:
@@ -142,13 +153,10 @@ int main(int argc, char *argv[])
 }
 
 ```
-
 ### Usage restrictions
 
 - The `key_type` must be `int64_t` or `uint64_t`.
 - The `score_type` must be `uint64_t`.
-- The keys of `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal using.
-
 ## Contributors
 
 HierarchicalKV is co-maintianed by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users,
@@ -172,6 +180,11 @@ cd HierarchicalKV && mkdir -p build && cd build
 cmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j
 ```
 
+For Debug:
+```shell
+cmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. && make -j
+```
+
 For Benchmark:
 ```shell
 ./merlin_hashtable_benchmark
diff --git a/include/merlin/core_kernels.cuh b/include/merlin/core_kernels.cuh
index d8a85002..4bca3173 100644
--- a/include/merlin/core_kernels.cuh
+++ b/include/merlin/core_kernels.cuh
@@ -640,7 +640,7 @@ __global__ void remove_kernel(const Table<K, V, S>* __restrict table,
        t += blockDim.x * gridDim.x) {
     int key_idx = t / TILE_SIZE;
     K find_key = keys[key_idx];
-    if (IS_RESERVED_KEY(find_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_key)) continue;
 
     int key_pos = -1;
 
@@ -719,7 +719,7 @@ __global__ void remove_kernel(const Table<K, V, S>* __restrict table,
           bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed);
       current_score =
           bucket->scores(key_offset)->load(cuda::std::memory_order_relaxed);
-      if (!IS_RESERVED_KEY(current_key)) {
+      if (!IS_RESERVED_KEY<K>(current_key)) {
         if (pred(current_key, current_score, pattern, threshold)) {
           atomicAdd(count, 1);
           key_pos = key_offset;
@@ -782,7 +782,7 @@ __global__ void dump_kernel(const Table<K, V, S>* __restrict table,
     const int key_idx{static_cast<int>((tid + offset) % bucket_max_size)};
     const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)};
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       size_t local_index{atomicAdd(&block_acc, 1)};
       block_tuples[local_index] = {
           key, &bucket->vectors[key_idx * dim],
@@ -846,7 +846,7 @@ __global__ void dump_kernel(const Table<K, V, S>* __restrict table,
         (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
     S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
 
-    if (!IS_RESERVED_KEY(key) && pred(key, score, pattern, threshold)) {
+    if (!IS_RESERVED_KEY<K>(key) && pred(key, score, pattern, threshold)) {
       size_t local_index = atomicAdd(&block_acc, 1);
       block_result_key[local_index] = key;
       for (int i = 0; i < dim; i++) {
diff --git a/include/merlin/core_kernels/accum_or_assign.cuh b/include/merlin/core_kernels/accum_or_assign.cuh
index 7f557f59..fbabb0fd 100644
--- a/include/merlin/core_kernels/accum_or_assign.cuh
+++ b/include/merlin/core_kernels/accum_or_assign.cuh
@@ -98,7 +98,7 @@ __global__ void accum_or_assign_kernel_with_io(
 
     const K insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(insert_key)) continue;
 
     const S insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
@@ -222,7 +222,7 @@ __global__ void accum_or_assign_kernel(
 
     const K insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(insert_key)) continue;
 
     const S insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
diff --git a/include/merlin/core_kernels/contains.cuh b/include/merlin/core_kernels/contains.cuh
index 1da4a4b8..a545e00e 100644
--- a/include/merlin/core_kernels/contains.cuh
+++ b/include/merlin/core_kernels/contains.cuh
@@ -253,7 +253,7 @@ __global__ void contains_kernel(const Table<K, V, S>* __restrict table,
     int key_idx = t / TILE_SIZE;
 
     const K find_key = keys[key_idx];
-    if (IS_RESERVED_KEY(find_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_key)) continue;
 
     int key_pos = -1;
     int src_lane = -1;
diff --git a/include/merlin/core_kernels/find_or_insert.cuh b/include/merlin/core_kernels/find_or_insert.cuh
index d0668841..654a230b 100644
--- a/include/merlin/core_kernels/find_or_insert.cuh
+++ b/include/merlin/core_kernels/find_or_insert.cuh
@@ -53,7 +53,7 @@ __global__ void tlp_v1_find_or_insert_kernel_with_io(
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -272,7 +272,7 @@ __global__ void tlp_v2_find_or_insert_kernel_with_io(
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -621,7 +621,7 @@ __global__ void pipeline_find_or_insert_kernel_with_io(
       S* sm_param_scores = SMM::param_scores(smem);
       __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
     }
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -1326,7 +1326,7 @@ __global__ void find_or_insert_kernel_with_io(
 
     const K find_or_insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(find_or_insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;
 
     const S find_or_insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
@@ -1463,7 +1463,7 @@ __global__ void find_or_insert_kernel_lock_key_hybrid(
 
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -1713,7 +1713,7 @@ __global__ void find_or_insert_kernel(
 
     const K find_or_insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(find_or_insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;
 
     const S find_or_insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
diff --git a/include/merlin/core_kernels/find_ptr_or_insert.cuh b/include/merlin/core_kernels/find_ptr_or_insert.cuh
index 2e5d8ff7..87472619 100644
--- a/include/merlin/core_kernels/find_ptr_or_insert.cuh
+++ b/include/merlin/core_kernels/find_ptr_or_insert.cuh
@@ -53,7 +53,7 @@ __global__ void find_or_insert_ptr_kernel_lock_key(
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -283,7 +283,7 @@ __global__ void find_ptr_or_insert_kernel(
 
     const K find_or_insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(find_or_insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;
 
     const S find_or_insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
diff --git a/include/merlin/core_kernels/lookup.cuh b/include/merlin/core_kernels/lookup.cuh
index 8310460f..9c443659 100644
--- a/include/merlin/core_kernels/lookup.cuh
+++ b/include/merlin/core_kernels/lookup.cuh
@@ -885,7 +885,7 @@ __global__ void lookup_kernel_with_io(
     int key_idx = t / TILE_SIZE;
 
     const K find_key = keys[key_idx];
-    if (IS_RESERVED_KEY(find_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_key)) continue;
 
     V* find_value = values + key_idx * dim;
 
@@ -1015,7 +1015,7 @@ __device__ void tlp_lookup_kernel_hybrid_impl(
   if (kv_idx < n) {
     key = keys[kv_idx];
     if (dst_offset) dst_offset[kv_idx] = kv_idx;
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -1140,7 +1140,7 @@ __device__ void lookup_kernel_impl(
     int key_idx = t / TILE_SIZE;
 
     const K find_key = keys[key_idx];
-    if (IS_RESERVED_KEY(find_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_key)) continue;
 
     int key_pos = -1;
     int src_lane = -1;
diff --git a/include/merlin/core_kernels/lookup_ptr.cuh b/include/merlin/core_kernels/lookup_ptr.cuh
index cfa5ea56..ce57b0b9 100644
--- a/include/merlin/core_kernels/lookup_ptr.cuh
+++ b/include/merlin/core_kernels/lookup_ptr.cuh
@@ -41,7 +41,7 @@ __global__ void tlp_lookup_ptr_kernel_with_filter(
   uint32_t key_pos = {0};
   if (kv_idx < n) {
     key = keys[kv_idx];
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -140,7 +140,7 @@ __global__ void lookup_ptr_kernel(const Table<K, V, S>* __restrict table,
     int key_idx = t / TILE_SIZE;
 
     const K find_key = keys[key_idx];
-    if (IS_RESERVED_KEY(find_key)) continue;
+    if (IS_RESERVED_KEY<K>(find_key)) continue;
 
     int key_pos = -1;
     int src_lane = -1;
diff --git a/include/merlin/core_kernels/update.cuh b/include/merlin/core_kernels/update.cuh
index 731e312b..69bcd250 100644
--- a/include/merlin/core_kernels/update.cuh
+++ b/include/merlin/core_kernels/update.cuh
@@ -44,7 +44,7 @@ __global__ void tlp_update_kernel_with_io(
   if (kv_idx < n) {
     key = keys[kv_idx];
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -680,7 +680,7 @@ __global__ void update_kernel_with_io(
 
     const K update_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(update_key)) continue;
+    if (IS_RESERVED_KEY<K>(update_key)) continue;
 
     const V* update_value = values + key_idx * dim;
 
@@ -773,7 +773,7 @@ __global__ void tlp_update_kernel_hybrid(
   if (kv_idx < n) {
     key = keys[kv_idx];
     if (src_offset) src_offset[kv_idx] = kv_idx;
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -878,7 +878,7 @@ __global__ void update_kernel(const Table<K, V, S>* __restrict table,
 
     const K update_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(update_key)) continue;
+    if (IS_RESERVED_KEY<K>(update_key)) continue;
 
     size_t bkt_idx = 0;
     size_t start_idx = 0;
diff --git a/include/merlin/core_kernels/update_score.cuh b/include/merlin/core_kernels/update_score.cuh
index fc24a745..9f4460c3 100644
--- a/include/merlin/core_kernels/update_score.cuh
+++ b/include/merlin/core_kernels/update_score.cuh
@@ -43,7 +43,7 @@ __global__ void tlp_update_score_kernel(Bucket<K, V, S>* __restrict__ buckets,
   if (kv_idx < n) {
     key = keys[kv_idx];
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -549,7 +549,7 @@ __global__ void update_score_kernel(const Table<K, V, S>* __restrict table,
 
     const K update_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(update_key)) continue;
+    if (IS_RESERVED_KEY<K>(update_key)) continue;
 
     size_t bkt_idx = 0;
     size_t start_idx = 0;
diff --git a/include/merlin/core_kernels/update_values.cuh b/include/merlin/core_kernels/update_values.cuh
index c2e3d17f..298d3bb9 100644
--- a/include/merlin/core_kernels/update_values.cuh
+++ b/include/merlin/core_kernels/update_values.cuh
@@ -42,7 +42,7 @@ __global__ void tlp_update_values_kernel_with_io(
   if (kv_idx < n) {
     key = keys[kv_idx];
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -638,7 +638,7 @@ __global__ void update_values_kernel_with_io(
 
     const K update_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(update_key)) continue;
+    if (IS_RESERVED_KEY<K>(update_key)) continue;
 
     const V* update_value = values + key_idx * dim;
 
@@ -724,7 +724,7 @@ __global__ void tlp_update_values_kernel_hybrid(
   if (kv_idx < n) {
     key = keys[kv_idx];
     if (src_offset) src_offset[kv_idx] = kv_idx;
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -823,7 +823,7 @@ __global__ void update_values_kernel(const Table<K, V, S>* __restrict table,
 
     const K update_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(update_key)) continue;
+    if (IS_RESERVED_KEY<K>(update_key)) continue;
 
     size_t bkt_idx = 0;
     size_t start_idx = 0;
diff --git a/include/merlin/core_kernels/upsert.cuh b/include/merlin/core_kernels/upsert.cuh
index 37e505a1..c955c210 100644
--- a/include/merlin/core_kernels/upsert.cuh
+++ b/include/merlin/core_kernels/upsert.cuh
@@ -53,7 +53,7 @@ __global__ void tlp_v1_upsert_kernel_with_io(
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -268,7 +268,7 @@ __global__ void tlp_v2_upsert_kernel_with_io(
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
 
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -596,7 +596,7 @@ __global__ void pipeline_upsert_kernel_with_io(
       S* sm_param_scores = SMM::param_scores(smem);
       __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
     }
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -1247,7 +1247,9 @@ __global__ void upsert_kernel_with_io_core(
 
     const K insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(insert_key)) {
+      continue;
+    }
 
     const S insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
@@ -1280,7 +1282,9 @@ __global__ void upsert_kernel_with_io_core(
       occupy_result = g.shfl(occupy_result, src_lane);
     } while (occupy_result == OccupyResult::CONTINUE);
 
-    if (occupy_result == OccupyResult::REFUSED) continue;
+    if (occupy_result == OccupyResult::REFUSED) {
+      continue;
+    }
 
     if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
          occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
@@ -1377,7 +1381,7 @@ __global__ void upsert_kernel_lock_key_hybrid(
     }
 
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -1612,7 +1616,7 @@ __global__ void upsert_kernel(const Table<K, V, S>* __restrict table,
     size_t key_idx = t / TILE_SIZE;
 
     const K insert_key = keys[key_idx];
-    if (IS_RESERVED_KEY(insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(insert_key)) continue;
 
     const S insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
diff --git a/include/merlin/core_kernels/upsert_and_evict.cuh b/include/merlin/core_kernels/upsert_and_evict.cuh
index c685dca7..d43af108 100644
--- a/include/merlin/core_kernels/upsert_and_evict.cuh
+++ b/include/merlin/core_kernels/upsert_and_evict.cuh
@@ -56,7 +56,7 @@ __global__ void tlp_v1_upsert_and_evict_kernel_unique(
   if (kv_idx < n) {
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -285,7 +285,7 @@ __global__ void tlp_v2_upsert_and_evict_kernel_unique(
   if (kv_idx < n) {
     key = keys[kv_idx];
     score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -639,7 +639,7 @@ __global__ void pipeline_upsert_and_evict_kernel_unique(
       S* sm_param_scores = SMM::param_scores(smem);
       __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
     }
-    if (!IS_RESERVED_KEY(key)) {
+    if (!IS_RESERVED_KEY<K>(key)) {
       const K hashed_key = Murmur3HashDevice(key);
       target_digests = digests_from_hashed<K>(hashed_key);
       uint64_t global_idx =
@@ -1397,7 +1397,7 @@ __global__ void upsert_and_evict_kernel_with_io_core(
 
     const K insert_key = keys[key_idx];
 
-    if (IS_RESERVED_KEY(insert_key)) continue;
+    if (IS_RESERVED_KEY<K>(insert_key)) continue;
 
     const S insert_score =
         ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
diff --git a/include/merlin/types.cuh b/include/merlin/types.cuh
index 176aa208..c8ac799f 100644
--- a/include/merlin/types.cuh
+++ b/include/merlin/types.cuh
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <cuda/std/semaphore>
 #include <vector>
+#include "debug.hpp"
 
 namespace nv {
 namespace merlin {
@@ -43,18 +44,70 @@ using byte = uint8_t;
 
 // Digest.
 using D = byte;
+constexpr uint64_t DEFAULT_EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF);
+constexpr uint64_t DEFAULT_RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE);
+constexpr uint64_t DEFAULT_LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD);
+
+constexpr uint64_t DEFAULT_RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC);
+constexpr uint64_t DEFAULT_VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE);
 
-constexpr uint64_t EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF);
-constexpr uint64_t RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE);
-constexpr uint64_t VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE);
-constexpr uint64_t LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD);
-constexpr uint64_t RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC);
 constexpr uint64_t MAX_SCORE = UINT64_C(0xFFFFFFFFFFFFFFFF);
 constexpr uint64_t EMPTY_SCORE = UINT64_C(0);
 constexpr uint64_t IGNORED_GLOBAL_EPOCH = UINT64_C(0xFFFFFFFFFFFFFFFF);
 
-#define IS_RESERVED_KEY(key) ((RESERVED_KEY_MASK & (key)) == RESERVED_KEY_MASK)
-#define IS_VACANT_KEY(key) ((VACANT_KEY_MASK & (key)) == VACANT_KEY_MASK)
+uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY;
+__constant__ uint64_t EMPTY_KEY = DEFAULT_EMPTY_KEY;
+__constant__ uint64_t RECLAIM_KEY = DEFAULT_RECLAIM_KEY;
+__constant__ uint64_t LOCKED_KEY = DEFAULT_LOCKED_KEY;
+
+__constant__ uint64_t RESERVED_KEY_MASK_1 = DEFAULT_RESERVED_KEY_MASK;
+__constant__ uint64_t RESERVED_KEY_MASK_2 = DEFAULT_RESERVED_KEY_MASK;
+__constant__ uint64_t VACANT_KEY_MASK_1 = DEFAULT_VACANT_KEY_MASK;
+__constant__ uint64_t VACANT_KEY_MASK_2 = DEFAULT_VACANT_KEY_MASK;
+
+constexpr int MAX_RESERVED_KEY_BIT = 62;
+
+template <class K>
+__forceinline__ __device__ bool IS_RESERVED_KEY(K key) {
+  return (RESERVED_KEY_MASK_1 & key) == RESERVED_KEY_MASK_2;
+}
+
+template <class K>
+__forceinline__ __device__ bool IS_VACANT_KEY(K key) {
+  return (VACANT_KEY_MASK_1 & key) == VACANT_KEY_MASK_2;
+}
+
+cudaError_t init_reserved_keys(int index) {
+  if (index < 1 || index > MAX_RESERVED_KEY_BIT) {
+    // index = 0 is the default,
+    // index = 62 is the maximum index can be set for reserved keys.
+    return cudaSuccess;
+  }
+  uint64_t reservedKeyMask1 = ~(UINT64_C(3) << index);
+  uint64_t reservedKeyMask2 = reservedKeyMask1 & ~UINT64_C(1);
+  uint64_t vacantKeyMask1 = ~(UINT64_C(1) << index);
+  uint64_t vacantKeyMask2 = vacantKeyMask1 & ~UINT64_C(1);
+  
+  uint64_t emptyKey = reservedKeyMask2 | (UINT64_C(3) << index);
+  uint64_t reclaimKey = vacantKeyMask2;
+  uint64_t lockedKey = emptyKey & ~(UINT64_C(2) << index);
+  EMPTY_KEY_CPU = emptyKey;
+
+//  printf("reserved keys are emptyKey, reclaimKey, lockedKey and reservedKeyMask2\n");
+//  printf("emptyKey: %lx, reclaimKey: %lx\n", emptyKey, reclaimKey);
+//  printf("lockedKey: %lx, reservedKeyMask1: %lx\n", lockedKey, reservedKeyMask1);
+//  printf("reservedKeyMask2: %lx, vacantKeyMask1: %lx\n", reservedKeyMask2, vacantKeyMask1);
+
+  CUDA_CHECK(cudaMemcpyToSymbol(EMPTY_KEY, &emptyKey, sizeof(uint64_t)));
+  CUDA_CHECK(cudaMemcpyToSymbol(RECLAIM_KEY, &reclaimKey, sizeof(uint64_t)));
+  CUDA_CHECK(cudaMemcpyToSymbol(LOCKED_KEY, &lockedKey, sizeof(uint64_t)));
+
+  CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_1, &reservedKeyMask1, sizeof(uint64_t)));
+  CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_2, &reservedKeyMask2, sizeof(uint64_t)));
+  CUDA_CHECK(cudaMemcpyToSymbol(VACANT_KEY_MASK_1, &vacantKeyMask1, sizeof(uint64_t)));
+  CUDA_CHECK(cudaMemcpyToSymbol(VACANT_KEY_MASK_2, &vacantKeyMask2, sizeof(uint64_t)));
+  return cudaGetLastError();
+}
 
 template <class K>
 using AtomicKey = cuda::atomic<K, cuda::thread_scope_device>;
diff --git a/include/merlin/utils.cuh b/include/merlin/utils.cuh
index 316f12bf..09499152 100644
--- a/include/merlin/utils.cuh
+++ b/include/merlin/utils.cuh
@@ -314,6 +314,20 @@ inline void free_pointers(cudaStream_t stream, int n, ...) {
   va_end(args);
 }
 
+__global__ void memset64bitKernel(void* devPtr, uint64_t value, size_t count) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < count) {
+    static_cast<uint64_t*>(devPtr)[idx] = value;
+  }
+}
+
+__forceinline__ __host__ cudaError_t memset64Async(void* devPtr, uint64_t value, size_t count, cudaStream_t stream = 0) {
+  int blockSize = 256;
+  int numBlocks = (count + blockSize - 1) / blockSize;
+  memset64bitKernel<<<numBlocks, blockSize, 0, stream>>>(devPtr, value, count);
+  return cudaGetLastError();
+}
+
 #define CUDA_FREE_POINTERS(stream, ...) \
   nv::merlin::free_pointers(            \
       stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__);
diff --git a/include/merlin_hashtable.cuh b/include/merlin_hashtable.cuh
index 9586f16a..ba435007 100644
--- a/include/merlin_hashtable.cuh
+++ b/include/merlin_hashtable.cuh
@@ -98,6 +98,17 @@ struct HashTableOptions {
   int device_id = -1;              ///< The ID of device.
   bool io_by_cpu = false;  ///< The flag indicating if the CPU handles IO.
   bool use_constant_memory = false;  ///< reserved
+  /*
+   * reserved_key_start_bit = 0, is the default behavior, HKV reserves `0xFFFFFFFFFFFFFFFD`,
+   * `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF`  for internal using.
+   * if the default one conflicted with your keys, change the reserved_key_start_bit
+   * value to a numbers between 1 and 62, reserved_key_start_bit = 1 means using the
+   * insignificant bits index 1 and 2 as the keys as the reserved keys and
+   * the index 0 bit is 0 and all the other bits are 1, the new reserved keys are
+   * `FFFFFFFFFFFFFFFE`, `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA`
+   * the console log prints the reserved keys during the table initialization.
+   */
+  int reserved_key_start_bit = 0; ///< The binary index of reserved key.
   MemoryPoolOptions
       device_memory_pool;  ///< Configuration options for device memory pool.
   MemoryPoolOptions
@@ -865,6 +876,10 @@ class HashTable : public HashTableBase<K, V, S> {
       return;
     }
     options_ = options;
+    MERLIN_CHECK(options.reserved_key_start_bit >= 0 &&
+                     options.reserved_key_start_bit <= MAX_RESERVED_KEY_BIT,
+                 "options.reserved_key_start_bit should >= 0 and <= 62.");
+    CUDA_CHECK(init_reserved_keys(options.reserved_key_start_bit));
 
     default_allocator_ = (allocator == nullptr);
     allocator_ = (allocator == nullptr) ? (new DefaultAllocator()) : allocator;
@@ -1103,8 +1118,6 @@ class HashTable : public HashTableBase<K, V, S> {
    * shape (n, DIM).
    * @param scores The scores to insert on GPU-accessible memory with shape
    * (n).
-   * @param scores The scores to insert on GPU-accessible memory with shape
-   * (n).
    * @params evicted_keys The output of keys replaced with minimum score.
    * @params evicted_values The output of values replaced with minimum score on
    * keys.
@@ -1198,8 +1211,7 @@ class HashTable : public HashTableBase<K, V, S> {
 
       size_type block_size = options_.block_size;
       size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size);
-      CUDA_CHECK(cudaMemsetAsync(evicted_keys, static_cast<int>(EMPTY_KEY),
-                                 n * sizeof(K), stream));
+      CUDA_CHECK(memset64Async(evicted_keys, EMPTY_KEY_CPU, n, stream));
       using Selector =
           SelectUpsertAndEvictKernelWithIO<key_type, value_type, score_type,
                                            evict_strategy>;
@@ -1236,8 +1248,6 @@ class HashTable : public HashTableBase<K, V, S> {
    * shape (n, DIM).
    * @param scores The scores to insert on GPU-accessible memory with shape
    * (n).
-   * @param scores The scores to insert on GPU-accessible memory with shape
-   * (n).
    * @params evicted_keys The output of keys replaced with minimum score.
    * @params evicted_values The output of values replaced with minimum score on
    * keys.
diff --git a/tests/accum_or_assign_test.cc.cu b/tests/accum_or_assign_test.cc.cu
index 6182bde1..4f0d776f 100644
--- a/tests/accum_or_assign_test.cc.cu
+++ b/tests/accum_or_assign_test.cc.cu
@@ -52,7 +52,7 @@ struct ExportIfPredFunctor {
   }
 };
 
-void test_basic_when_full(size_t max_hbm_for_vectors) {
+void test_basic_when_full(size_t max_hbm_for_vectors, int key_start) {
   constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
@@ -65,7 +65,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -162,7 +162,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_erase_if_pred(size_t max_hbm_for_vectors) {
+void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start) {
   constexpr uint64_t INIT_CAPACITY = 256UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 128UL;
@@ -176,7 +176,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   bool* h_accum_or_assigns;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -301,7 +301,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_rehash(size_t max_hbm_for_vectors) {
+void test_rehash(size_t max_hbm_for_vectors, int key_start) {
   constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
   constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
   constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
@@ -316,7 +316,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   bool* h_accum_or_assigns;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -452,7 +452,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
+void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start) {
   constexpr uint64_t INIT_CAPACITY = 1024;
   constexpr uint64_t MAX_CAPACITY = 16 * 1024;
   constexpr uint64_t INIT_KEY_NUM = 1024;
@@ -468,7 +468,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
   float true_ratio = 0.6f;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -759,7 +759,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
 //  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
 //}
 //
-void test_export_batch_if(size_t max_hbm_for_vectors) {
+void test_export_batch_if(size_t max_hbm_for_vectors, int key_start) {
   constexpr uint64_t INIT_CAPACITY = 256UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 128UL;
@@ -773,7 +773,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) {
   size_t h_dump_counter = 0;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1140,7 +1140,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1152,7 +1152,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   constexpr float true_ratio = 0.5;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1364,7 +1364,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr int RSHIFT_ON_NANO = 20;
 
   constexpr uint64_t BUCKET_NUM = 8UL;
@@ -1379,6 +1379,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1590,7 +1591,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1603,6 +1604,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1854,7 +1856,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1867,6 +1869,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2112,7 +2115,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -2126,6 +2129,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2365,7 +2369,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
   constexpr uint64_t STEPS = 128;
   constexpr uint64_t MAX_BUCKET_SIZE = 128;
@@ -2378,6 +2382,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2500,7 +2505,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
+void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
   constexpr uint64_t INIT_CAPACITY = 4 * 1024 - BUCKET_MAX_SIZE - 1;
   constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
@@ -2511,6 +2516,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2864,27 +2870,27 @@ void test_accum_or_assign_values_check(size_t max_hbm_for_vectors) {
 }
 
 TEST(AccumOrAssignTest, test_export_batch_if) {
-  test_export_batch_if(16);
-  test_export_batch_if(0);
+  test_export_batch_if(16, 22);
+  test_export_batch_if(0, 0);
 }
 TEST(AccumOrAssignTest, test_basic_when_full) {
-  test_basic_when_full(16);
-  test_basic_when_full(0);
+  test_basic_when_full(16, 2);
+  test_basic_when_full(0, 0);
 }
 TEST(AccumOrAssignTest, test_erase_if_pred) {
-  test_erase_if_pred(16);
-  test_erase_if_pred(0);
+  test_erase_if_pred(16, 0);
+  test_erase_if_pred(0, 5);
 }
 TEST(AccumOrAssignTest, test_rehash) {
-  test_rehash(16);
-  test_rehash(0);
+  test_rehash(16, 7);
+  test_rehash(0, 0);
 }
 TEST(AccumOrAssignTest, test_rehash_on_big_batch) {
-  test_rehash_on_big_batch(16);
-  test_rehash_on_big_batch(0);
+  test_rehash_on_big_batch(16, 9);
+  test_rehash_on_big_batch(0, 0);
 }
 TEST(AccumOrAssignTest, test_dynamic_rehash_on_multi_threads) {
-  test_dynamic_rehash_on_multi_threads(16);
+  test_dynamic_rehash_on_multi_threads(16, 56);
   test_dynamic_rehash_on_multi_threads(0);
 }
 TEST(AccumOrAssignTest, test_evict_strategy_lru_basic) {
@@ -2892,32 +2898,32 @@ TEST(AccumOrAssignTest, test_evict_strategy_lru_basic) {
   test_evict_strategy_lru_basic(0);
 }
 TEST(AccumOrAssignTest, test_evict_strategy_lfu_basic) {
-  test_evict_strategy_lfu_basic(16);
+  test_evict_strategy_lfu_basic(16, 3);
   // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
   // test_evict_strategy_lfu_basic(0);
 }
 
 TEST(AccumOrAssignTest, test_evict_strategy_epochlru_basic) {
-  test_evict_strategy_epochlru_basic(16);
+  test_evict_strategy_epochlru_basic(16, 33);
   test_evict_strategy_epochlru_basic(0);
 }
 
 TEST(AccumOrAssignTest, test_evict_strategy_epochlfu_basic) {
   test_evict_strategy_epochlfu_basic(16);
-  test_evict_strategy_epochlfu_basic(0);
+  test_evict_strategy_epochlfu_basic(0, 44);
 }
 TEST(AccumOrAssignTest, test_evict_strategy_customized_basic) {
   test_evict_strategy_customized_basic(16);
-  test_evict_strategy_customized_basic(0);
+  test_evict_strategy_customized_basic(0, 23);
 }
 TEST(AccumOrAssignTest, test_evict_strategy_customized_advanced) {
-  test_evict_strategy_customized_advanced(16);
+  test_evict_strategy_customized_advanced(16, 16);
   test_evict_strategy_customized_advanced(0);
 }
 TEST(AccumOrAssignTest, test_evict_strategy_customized_correct_rate) {
   // TODO(rhdong): after blossom CI issue is resolved, the skip logic.
   const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
-  test_evict_strategy_customized_correct_rate(16);
+  test_evict_strategy_customized_correct_rate(16, 61);
   if (!skip_hmem_check) {
     test_evict_strategy_customized_correct_rate(0);
   } else {
diff --git a/tests/assign_score_test.cc.cu b/tests/assign_score_test.cc.cu
index abde3924..87241158 100644
--- a/tests/assign_score_test.cc.cu
+++ b/tests/assign_score_test.cc.cu
@@ -38,7 +38,7 @@ using S = uint64_t;
 using EvictStrategy = nv::merlin::EvictStrategy;
 using TableOptions = nv::merlin::HashTableOptions;
 
-void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -50,6 +50,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -217,7 +218,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -229,6 +230,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -392,7 +394,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr int RSHIFT_ON_NANO = 20;
 
   constexpr uint64_t BUCKET_NUM = 8UL;
@@ -406,6 +408,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -580,7 +583,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -592,6 +595,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -792,7 +796,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -804,6 +808,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -962,7 +967,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -974,6 +979,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1500,7 +1506,7 @@ void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {
   }
 }
 
-void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
   constexpr uint64_t STEPS = 128;
   constexpr uint64_t MAX_BUCKET_SIZE = 128;
@@ -1512,6 +1518,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1776,26 +1783,26 @@ void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
 
 TEST(AssignScoreTest, test_evict_strategy_lru_basic) {
   test_evict_strategy_lru_basic(16);
-  test_evict_strategy_lru_basic(0);
+  test_evict_strategy_lru_basic(0, 34);
 }
 TEST(AssignScoreTest, test_evict_strategy_lfu_basic) {
   test_evict_strategy_lfu_basic(16);
-  test_evict_strategy_lfu_basic(0);
+  test_evict_strategy_lfu_basic(0, 2);
 }
 TEST(AssignScoreTest, test_evict_strategy_epochlru_basic) {
-  test_evict_strategy_epochlru_basic(16);
+  test_evict_strategy_epochlru_basic(16, 51);
   test_evict_strategy_epochlru_basic(0);
 }
 TEST(AssignScoreTest, test_evict_strategy_epochlfu_basic) {
-  test_evict_strategy_epochlfu_basic(16);
+  test_evict_strategy_epochlfu_basic(16, 4);
   test_evict_strategy_epochlfu_basic(0);
 }
 TEST(AssignScoreTest, test_evict_strategy_customized_basic) {
   test_evict_strategy_customized_basic(16);
-  test_evict_strategy_customized_basic(0);
+  test_evict_strategy_customized_basic(0, 11);
 }
 TEST(AssignScoreTest, test_evict_strategy_customized_advanced) {
-  test_evict_strategy_customized_advanced(16);
+  test_evict_strategy_customized_advanced(16, 33);
   test_evict_strategy_customized_advanced(0);
 }
 TEST(AssignScoreTest, test_assign_advanced_on_epochlfu) {
@@ -1804,7 +1811,7 @@ TEST(AssignScoreTest, test_assign_advanced_on_epochlfu) {
 TEST(AssignScoreTest, test_evict_strategy_customized_correct_rate) {
   // TODO(rhdong): after blossom CI issue is resolved, the skip logic.
   const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
-  test_evict_strategy_customized_correct_rate(16);
+  test_evict_strategy_customized_correct_rate(16, 44);
   if (!skip_hmem_check) {
     test_evict_strategy_customized_correct_rate(0);
   } else {
diff --git a/tests/assign_values_test.cc.cu b/tests/assign_values_test.cc.cu
index c721addf..07eaf1e4 100644
--- a/tests/assign_values_test.cc.cu
+++ b/tests/assign_values_test.cc.cu
@@ -37,7 +37,7 @@ using S = uint64_t;
 using EvictStrategy = nv::merlin::EvictStrategy;
 using TableOptions = nv::merlin::HashTableOptions;
 
-void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -49,6 +49,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -202,7 +203,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start = 0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -214,6 +215,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -735,12 +737,12 @@ void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {
 }
 
 TEST(AssignValuesTest, test_evict_strategy_lru_basic) {
-  test_evict_strategy_lru_basic(16);
+  test_evict_strategy_lru_basic(16, 21);
   test_evict_strategy_lru_basic(0);
 }
 TEST(AssignValuesTest, test_evict_strategy_epochlfu_basic) {
   test_evict_strategy_epochlfu_basic(16);
-  test_evict_strategy_epochlfu_basic(0);
+  test_evict_strategy_epochlfu_basic(0, 8);
 }
 TEST(AssignValuesTest, test_assign_advanced_on_epochlfu) {
   test_assign_advanced_on_epochlfu(16);
diff --git a/tests/find_or_insert_ptr_test.cc.cu b/tests/find_or_insert_ptr_test.cc.cu
index 1588a699..f82b18aa 100644
--- a/tests/find_or_insert_ptr_test.cc.cu
+++ b/tests/find_or_insert_ptr_test.cc.cu
@@ -56,7 +56,7 @@ struct ExportIfPredFunctor {
   }
 };
 
-void test_basic(size_t max_hbm_for_vectors) {
+void test_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
@@ -69,6 +69,7 @@ void test_basic(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -331,7 +332,7 @@ void test_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_basic_when_full(size_t max_hbm_for_vectors) {
+void test_basic_when_full(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
@@ -344,6 +345,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
 
   TableOptions options;
 
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -463,7 +465,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_erase_if_pred(size_t max_hbm_for_vectors) {
+void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 256UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 128UL;
@@ -476,7 +478,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -614,7 +616,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_rehash(size_t max_hbm_for_vectors) {
+void test_rehash(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
   constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
   constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
@@ -626,7 +628,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -771,7 +773,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
+void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 1024;
   constexpr uint64_t MAX_CAPACITY = 16 * 1024;
   constexpr uint64_t INIT_KEY_NUM = 1024;
@@ -782,7 +784,6 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -945,7 +946,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
+void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
   constexpr uint64_t INIT_CAPACITY = 4 * 1024;
   constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
@@ -955,7 +956,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
   std::vector<std::thread> threads;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1089,7 +1090,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
   ASSERT_EQ(table->capacity(), MAX_CAPACITY);
 }
 
-void test_export_batch_if(size_t max_hbm_for_vectors) {
+void test_export_batch_if(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 256UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 128UL;
@@ -1102,7 +1103,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) {
   size_t h_dump_counter = 0;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1286,7 +1287,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_basic_for_cpu_io() {
+void test_basic_for_cpu_io(int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
@@ -1298,7 +1299,7 @@ void test_basic_for_cpu_io() {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1464,7 +1465,7 @@ void test_basic_for_cpu_io() {
   CudaCheckError();
 }
 
-void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1475,7 +1476,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1673,7 +1674,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1684,7 +1685,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1872,7 +1873,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr int RSHIFT_ON_NANO = 20;
 
   constexpr uint64_t BUCKET_NUM = 8UL;
@@ -1885,7 +1886,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2092,7 +2093,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -2103,7 +2104,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2333,7 +2334,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -2344,7 +2345,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2532,7 +2533,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -2543,7 +2544,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 256;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2760,7 +2761,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
   constexpr uint64_t STEPS = 128;
   constexpr uint64_t MAX_BUCKET_SIZE = 128;
@@ -2771,7 +2772,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
   const int rounds = 12;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -3443,7 +3444,7 @@ void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
   }
 }
 
-void test_duplicated_keys(size_t max_hbm_for_vectors) {
+void test_duplicated_keys(size_t max_hbm_for_vectors, size_t key_start = 0) {
   constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1024UL;
@@ -3455,7 +3456,7 @@ void test_duplicated_keys(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -3542,7 +3543,7 @@ void test_duplicated_keys(size_t max_hbm_for_vectors) {
 
 TEST(FindOrInsertPtrTest, test_export_batch_if) {
   test_export_batch_if(16);
-  test_export_batch_if(0);
+  test_export_batch_if(0, 33);
 }
 TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) {
   test_find_or_insert_multi_threads(16, 0.25f, 0.125f);
@@ -3551,58 +3552,61 @@ TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) {
   test_find_or_insert_multi_threads(0, 0.375f, 0.125f);
 }
 TEST(FindOrInsertPtrTest, test_basic) {
-  test_basic(16);
+  test_basic(16, 3);
   test_basic(0);
 }
 TEST(FindOrInsertPtrTest, test_basic_when_full) {
-  test_basic_when_full(16);
+  test_basic_when_full(16, 4);
   test_basic_when_full(0);
 }
 TEST(FindOrInsertPtrTest, test_erase_if_pred) {
   test_erase_if_pred(16);
-  test_erase_if_pred(0);
+  test_erase_if_pred(0, 18);
 }
 TEST(FindOrInsertPtrTest, test_rehash) {
   test_rehash(16);
-  test_rehash(0);
+  test_rehash(0, 44);
 }
 TEST(FindOrInsertPtrTest, test_rehash_on_big_batch) {
-  test_rehash_on_big_batch(16);
+  test_rehash_on_big_batch(16,23);
   test_rehash_on_big_batch(0);
 }
 TEST(FindOrInsertPtrTest, test_dynamic_rehash_on_multi_threads) {
   test_dynamic_rehash_on_multi_threads(16);
-  test_dynamic_rehash_on_multi_threads(0);
+  test_dynamic_rehash_on_multi_threads(0, 19);
+}
+TEST(FindOrInsertPtrTest, test_basic_for_cpu_io) {
+  test_basic_for_cpu_io();
+  test_basic_for_cpu_io(52);
 }
-TEST(FindOrInsertPtrTest, test_basic_for_cpu_io) { test_basic_for_cpu_io(); }
 TEST(FindOrInsertPtrTest, test_evict_strategy_lru_basic) {
   test_evict_strategy_lru_basic(16);
-  test_evict_strategy_lru_basic(0);
+  test_evict_strategy_lru_basic(0, 18);
 }
 TEST(FindOrInsertPtrTest, test_evict_strategy_lfu_basic) {
-  test_evict_strategy_lfu_basic(16);
+  test_evict_strategy_lfu_basic(16, 29);
   test_evict_strategy_lfu_basic(0);
 }
 TEST(FindOrInsertPtrTest, test_evict_strategy_epochlru_basic) {
-  test_evict_strategy_epochlru_basic(16);
+  test_evict_strategy_epochlru_basic(16, 45);
   test_evict_strategy_epochlru_basic(0);
 }
 TEST(FindOrInsertPtrTest, test_evict_strategy_epochlfu_basic) {
   test_evict_strategy_epochlfu_basic(16);
-  test_evict_strategy_epochlfu_basic(0);
+  test_evict_strategy_epochlfu_basic(0, 59);
 }
 TEST(FindOrInsertPtrTest, test_evict_strategy_customized_basic) {
-  test_evict_strategy_customized_basic(16);
+  test_evict_strategy_customized_basic(16, 38);
   test_evict_strategy_customized_basic(0);
 }
 TEST(FindOrInsertPtrTest, test_evict_strategy_customized_advanced) {
   test_evict_strategy_customized_advanced(16);
-  test_evict_strategy_customized_advanced(0);
+  test_evict_strategy_customized_advanced(0, 25);
 }
 TEST(FindOrInsertPtrTest, test_evict_strategy_customized_correct_rate) {
   // TODO(rhdong): after blossom CI issue is resolved, the skip logic.
   const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
-  test_evict_strategy_customized_correct_rate(16);
+  test_evict_strategy_customized_correct_rate(16, 16);
   if (!skip_hmem_check) {
     test_evict_strategy_customized_correct_rate(0);
   } else {
@@ -3615,6 +3619,6 @@ TEST(FindOrInsertPtrTest, test_find_or_insert_values_check) {
   test_find_or_insert_values_check(0);
 }
 TEST(FindOrInsertPtrTest, test_duplicated_keys) {
-  test_duplicated_keys(16);
+  test_duplicated_keys(16, 39);
   test_duplicated_keys(0);
 }
diff --git a/tests/find_or_insert_test.cc.cu b/tests/find_or_insert_test.cc.cu
index 9ee01cb8..1aa96d7a 100644
--- a/tests/find_or_insert_test.cc.cu
+++ b/tests/find_or_insert_test.cc.cu
@@ -56,7 +56,7 @@ struct ExportIfPredFunctor {
   }
 };
 
-void test_basic(size_t max_hbm_for_vectors) {
+void test_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
@@ -68,7 +68,7 @@ void test_basic(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -300,7 +300,7 @@ void test_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_basic_when_full(size_t max_hbm_for_vectors) {
+void test_basic_when_full(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = INIT_CAPACITY;
@@ -312,7 +312,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -423,7 +423,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_erase_if_pred(size_t max_hbm_for_vectors) {
+void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 256UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 128UL;
@@ -436,7 +436,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -559,7 +559,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_rehash(size_t max_hbm_for_vectors) {
+void test_rehash(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
   constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
   constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
@@ -571,7 +571,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -701,7 +701,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
+void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 1024;
   constexpr uint64_t MAX_CAPACITY = 16 * 1024;
   constexpr uint64_t INIT_KEY_NUM = 1024;
@@ -712,7 +712,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -845,7 +845,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
+void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
   constexpr uint64_t INIT_CAPACITY = 4 * 1024;
   constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
@@ -855,7 +855,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
   std::vector<std::thread> threads;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -974,7 +974,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
   ASSERT_EQ(table->capacity(), MAX_CAPACITY);
 }
 
-void test_export_batch_if(size_t max_hbm_for_vectors) {
+void test_export_batch_if(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 256UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 128UL;
@@ -987,7 +987,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) {
   size_t h_dump_counter = 0;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1156,7 +1156,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_basic_for_cpu_io() {
+void test_basic_for_cpu_io(int key_start=0) {
   constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
   constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
   constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
@@ -1168,7 +1168,7 @@ void test_basic_for_cpu_io() {
   bool* h_found;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1305,7 +1305,7 @@ void test_basic_for_cpu_io() {
   CudaCheckError();
 }
 
-void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1316,7 +1316,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1482,7 +1482,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1493,7 +1493,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1655,7 +1655,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr int RSHIFT_ON_NANO = 20;
 
   constexpr uint64_t BUCKET_NUM = 8UL;
@@ -1668,7 +1668,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -1843,7 +1843,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -1854,7 +1854,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2056,7 +2056,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -2067,7 +2067,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 128;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -2227,7 +2227,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   CudaCheckError();
 }
 
-void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
+void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start=0) {
   constexpr uint64_t BUCKET_NUM = 8UL;
   constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
   constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
@@ -2238,7 +2238,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
   constexpr uint64_t TEST_TIMES = 256;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -3594,7 +3594,7 @@ void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
 
 TEST(FindOrInsertTest, test_export_batch_if) {
   test_export_batch_if(16);
-  test_export_batch_if(0);
+  test_export_batch_if(0, 31);
 }
 TEST(FindOrInsertTest, test_find_or_insert_multi_threads) {
   test_find_or_insert_multi_threads(16, 0.25f, 0.125f);
@@ -3623,52 +3623,55 @@ TEST(FindOrInsertTest, test_value_type_hbm_mode) {
   test_value_type_hbm_mode<double, 63>();
 }
 TEST(FindOrInsertTest, test_basic) {
-  test_basic(16);
+  test_basic(16, 61);
   test_basic(0);
 }
 TEST(FindOrInsertTest, test_basic_when_full) {
   test_basic_when_full(16);
-  test_basic_when_full(0);
+  test_basic_when_full(0, 41);
 }
 TEST(FindOrInsertTest, test_erase_if_pred) {
   test_erase_if_pred(16);
-  test_erase_if_pred(0);
+  test_erase_if_pred(0, 17);
 }
 TEST(FindOrInsertTest, test_rehash) {
   test_rehash(16);
-  test_rehash(0);
+  test_rehash(0, 22);
 }
 TEST(FindOrInsertTest, test_rehash_on_big_batch) {
-  test_rehash_on_big_batch(16);
+  test_rehash_on_big_batch(16, 37);
   test_rehash_on_big_batch(0);
 }
 TEST(FindOrInsertTest, test_dynamic_rehash_on_multi_threads) {
-  test_dynamic_rehash_on_multi_threads(16);
+  test_dynamic_rehash_on_multi_threads(16, 22);
   test_dynamic_rehash_on_multi_threads(0);
 }
-TEST(FindOrInsertTest, test_basic_for_cpu_io) { test_basic_for_cpu_io(); }
+TEST(FindOrInsertTest, test_basic_for_cpu_io) {
+  test_basic_for_cpu_io(45);
+  test_basic_for_cpu_io();
+}
 TEST(FindOrInsertTest, test_evict_strategy_lru_basic) {
   test_evict_strategy_lru_basic(16);
-  test_evict_strategy_lru_basic(0);
+  test_evict_strategy_lru_basic(0, 44);
 }
 TEST(FindOrInsertTest, test_evict_strategy_lfu_basic) {
-  test_evict_strategy_lfu_basic(16);
+  test_evict_strategy_lfu_basic(16, 34);
   test_evict_strategy_lfu_basic(0);
 }
 TEST(FindOrInsertTest, test_evict_strategy_epochlru_basic) {
-  test_evict_strategy_epochlru_basic(16);
+  test_evict_strategy_epochlru_basic(16, 41);
   test_evict_strategy_epochlru_basic(0);
 }
 TEST(FindOrInsertTest, test_evict_strategy_epochlfu_basic) {
-  test_evict_strategy_epochlfu_basic(16);
+  test_evict_strategy_epochlfu_basic(16, 42);
   test_evict_strategy_epochlfu_basic(0);
 }
 TEST(FindOrInsertTest, test_evict_strategy_customized_basic) {
   test_evict_strategy_customized_basic(16);
-  test_evict_strategy_customized_basic(0);
+  test_evict_strategy_customized_basic(0, 43);
 }
 TEST(FindOrInsertTest, test_evict_strategy_customized_advanced) {
-  test_evict_strategy_customized_advanced(16);
+  test_evict_strategy_customized_advanced(16, 54);
   test_evict_strategy_customized_advanced(0);
 }
 TEST(FindOrInsertTest, test_assign_advanced_on_epochlfu) {
diff --git a/tests/find_with_missed_keys_test.cc.cu b/tests/find_with_missed_keys_test.cc.cu
index 7b93761c..29b6e22f 100644
--- a/tests/find_with_missed_keys_test.cc.cu
+++ b/tests/find_with_missed_keys_test.cc.cu
@@ -30,7 +30,7 @@ using EvictStrategy = nv::merlin::EvictStrategy;
 using TableOptions = nv::merlin::HashTableOptions;
 
 void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size,
-               double load_factor, bool pipeline_lookup) {
+               double load_factor, bool pipeline_lookup, int key_start = 0) {
   MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0,
                "Invalid `load_factor`");
 
@@ -46,7 +46,7 @@ void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size,
   int* h_missed_indices;
 
   TableOptions options;
-
+  options.reserved_key_start_bit = key_start;
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
@@ -157,14 +157,14 @@ void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size,
 
 TEST(FindTest, test_find_when_empty) {
   // pure HMEM
-  test_find(0, 128, 0.0, true);
+  test_find(0, 128, 0.0, true, 12);
   test_find(0, 256, 0.0, false);
   // hybrid
-  test_find(32, 128, 0.0, true);
+  test_find(32, 128, 0.0, true, 58);
   test_find(32, 256, 0.0, false);
   // pure HBM
   test_find(1024, 128, 0.0, true);
-  test_find(1024, 256, 0.0, false);
+  test_find(1024, 256, 0.0, false, 12);
 }
 
 TEST(FindTest, test_find_when_full) {
@@ -173,7 +173,7 @@ TEST(FindTest, test_find_when_full) {
   test_find(0, 256, 1.0, false);
   // hybrid
   test_find(32, 128, 1.0, true);
-  test_find(32, 256, 1.0, false);
+  test_find(32, 256, 1.0, false, 60);
   // pure HBM
   test_find(1024, 128, 1.0, true);
   test_find(1024, 256, 1.0, false);
@@ -181,32 +181,33 @@ TEST(FindTest, test_find_when_full) {
 
 TEST(FindTest, test_find_load_factor) {
   // pure HMEM
-  test_find(0, 128, 0.2, true);
-  test_find(0, 256, 0.2, false);
+  test_find(0, 128, 0.2, true, 45);
+  test_find(0, 256, 0.2, false, 12);
   // hybrid
-  test_find(32, 128, 0.2, true);
-  test_find(32, 256, 0.2, false);
+  test_find(32, 128, 0.2, true, 27);
+  test_find(32, 256, 0.2, false, 53);
   // pure HBM
-  test_find(1024, 128, 0.2, true);
-  test_find(1024, 256, 0.2, false);
+  test_find(1024, 128, 0.2, true, 9);
+  test_find(1024, 256, 0.2, false, 38);
 
   // pure HMEM
-  test_find(0, 128, 0.5, true);
-  test_find(0, 256, 0.5, false);
+  test_find(0, 128, 0.5, true, 21);
+  test_find(0, 256, 0.5, false, 46);
   // hybrid
-  test_find(32, 128, 0.5, true);
-  test_find(32, 256, 0.5, false);
+  test_find(32, 128, 0.5, true, 31);
+  test_find(32, 256, 0.5, false, 59);
   // pure HBM
-  test_find(1024, 128, 0.5, true);
-  test_find(1024, 256, 0.5, false);
+  test_find(1024, 128, 0.5, true, 4);
+  test_find(1024, 256, 0.5, false, 22);
 
   // pure HMEM
-  test_find(0, 128, 0.75, true);
-  test_find(0, 256, 0.75, false);
+  test_find(0, 128, 0.75, true, 11);
+  test_find(0, 256, 0.75, false, 34);
   // hybrid
-  test_find(32, 128, 0.75, true);
-  test_find(32, 256, 0.75, false);
+  test_find(32, 128, 0.75, true, 18);
+  test_find(32, 256, 0.75, false, 47);
   // pure HBM
-  test_find(1024, 128, 0.75, true);
-  test_find(1024, 256, 0.75, false);
+  test_find(1024, 128, 0.75, true, 7);
+  test_find(1024, 256, 0.75, false, 29);
+
 }
diff --git a/tests/merlin_hashtable_test.cc.cu b/tests/merlin_hashtable_test.cc.cu
index ed80be3c..9492592c 100644
--- a/tests/merlin_hashtable_test.cc.cu
+++ b/tests/merlin_hashtable_test.cc.cu
@@ -145,6 +145,8 @@ void test_basic(size_t max_hbm_for_vectors) {
   options.dim = DIM;
   options.max_bucket_size = BUCKET_MAX_SIZE;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
+  options.reserved_key_start_bit = 2;
+
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
 
   CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
@@ -512,6 +514,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 3;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
 
@@ -641,6 +644,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 4;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
 
@@ -775,6 +779,7 @@ void test_rehash(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 5;
   options.max_bucket_size = BUCKET_MAX_SIZE;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
@@ -916,6 +921,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 6;
   options.max_bucket_size = 128;
   options.max_load_factor = 0.6;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
@@ -1059,6 +1065,7 @@ void test_rehash_on_big_batch_specific(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 7;
   options.max_bucket_size = 128;
   options.max_load_factor = 0.6;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
@@ -1129,6 +1136,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 8;
   options.max_load_factor = 0.50f;
   options.max_bucket_size = BUCKET_MAX_SIZE;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
@@ -1264,6 +1272,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 9;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
 
@@ -1447,6 +1456,7 @@ void test_basic_for_cpu_io() {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 10;
   options.max_hbm_for_vectors = nv::merlin::GB(0);
   options.io_by_cpu = true;
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
@@ -1595,6 +1605,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 11;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
 
@@ -1771,6 +1782,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 12;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;
 
@@ -1948,6 +1960,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 13;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;
 
@@ -2133,6 +2146,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 14;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;
 
@@ -2344,6 +2358,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 15;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
 
@@ -2513,6 +2528,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 16;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
 
@@ -2712,6 +2728,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
   options.init_capacity = INIT_CAPACITY;
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
+  options.reserved_key_start_bit = 17;
   options.max_bucket_size = MAX_BUCKET_SIZE;
   options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
@@ -3327,6 +3344,7 @@ void test_bucket_size(bool load_scores = true) {
   options.max_capacity = MAX_CAPACITY;
   options.dim = DIM;
   options.max_hbm_for_vectors = nv::merlin::GB(16);
+  options.reserved_key_start_bit = 1;
   using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
 
   CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
diff --git a/tests/reserved_keys_test.cc.cu b/tests/reserved_keys_test.cc.cu
new file mode 100644
index 00000000..a69589c8
--- /dev/null
+++ b/tests/reserved_keys_test.cc.cu
@@ -0,0 +1,104 @@
+/*
+* Copyright (c) 2024, NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <cstdint>
+#include <gtest/gtest.h>
+#include "merlin/types.cuh"
+#include "test_util.cuh"
+#include "merlin/utils.cuh"
+
+
+using namespace nv::merlin;
+
+__global__ void testReservedKeysKernel(uint64_t* keys, bool* results, size_t numKeys) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < numKeys) {
+    results[idx] = IS_RESERVED_KEY(keys[idx]);
+  }
+}
+
+void testCustomMemsetAsync() {
+  size_t numElements = 4;
+  uint64_t value = 0xFFFFFFFFFFFFFFF1;
+  uint64_t* devPtr;
+  uint64_t* hostData = new uint64_t[numElements];
+
+  cudaMalloc((void**)&devPtr, numElements * sizeof(uint64_t));
+  memset64Async(devPtr, value, numElements);
+  cudaMemcpy(hostData, devPtr, numElements * sizeof(uint64_t), cudaMemcpyDeviceToHost);
+  for (size_t i = 0; i < numElements; i++) {
+    assert(hostData[i] == value);
+  }
+
+  std::cout << "All values were set correctly!" << std::endl;
+
+  cudaFree(devPtr);
+  delete[] hostData;
+}
+
+void testReservedKeys(uint64_t* testKeys, bool* expectedResults, size_t numKeys) {
+  uint64_t* d_keys;
+  bool* d_results;
+  bool* h_results = new bool[numKeys];
+
+  cudaMalloc(&d_keys, numKeys * sizeof(uint64_t));
+  cudaMalloc(&d_results, numKeys * sizeof(bool));
+
+  cudaMemcpy(d_keys, testKeys, numKeys * sizeof(uint64_t), cudaMemcpyHostToDevice);
+
+  int blockSize = 256;
+  int numBlocks = (numKeys + blockSize - 1) / blockSize;
+
+  testReservedKeysKernel<<<numBlocks, blockSize>>>(d_keys, d_results, numKeys);
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(h_results, d_results, numKeys * sizeof(bool), cudaMemcpyDeviceToHost);
+
+  for (size_t i = 0; i < numKeys; i++) {
+    assert(h_results[i] == expectedResults[i]);
+  }
+
+  cudaFree(d_keys);
+  cudaFree(d_results);
+  delete[] h_results;
+  CudaCheckError();
+  std::cout << "All tests passed." << std::endl;
+}
+
+void testKeyOptions() {
+  for (int i = 0; i <= MAX_RESERVED_KEY_BIT; i++) {
+    CUDA_CHECK(init_reserved_keys(i));
+    uint64_t host_reclaim_key, host_locked_key;
+    cudaMemcpyFromSymbol(&host_reclaim_key, RECLAIM_KEY, sizeof(uint64_t));
+    cudaMemcpyFromSymbol(&host_locked_key, LOCKED_KEY, sizeof(uint64_t));
+
+    uint64_t testKeys[6] = {
+        EMPTY_KEY_CPU, host_reclaim_key, host_locked_key,
+        UINT64_C(0x0), UINT64_C(0x10),
+        DEFAULT_EMPTY_KEY
+    };
+    bool expectedResults[6] = {
+        true, true, true, false, false,
+        (i == 0)? true : false
+    };
+    testReservedKeys(testKeys, expectedResults, 4);
+  }
+}
+
+TEST(ReservedKeysTest, testKeyOptions) {
+  testKeyOptions();
+  testCustomMemsetAsync();
+}
\ No newline at end of file