diff --git a/.gitignore b/.gitignore index 122c321f..eba684ea 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,8 @@ .idea .vscode build - +.clwb +cmake-build-debug/ docs/build docs/source/README.md docs/source/CONTRIBUTING.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 77190094..34016ca8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,3 +159,8 @@ add_executable(find_with_missed_keys_test tests/find_with_missed_keys_test.cc.cu target_compile_features(find_with_missed_keys_test PUBLIC cxx_std_14) set_target_properties(find_with_missed_keys_test PROPERTIES CUDA_ARCHITECTURES OFF) TARGET_LINK_LIBRARIES(find_with_missed_keys_test gtest_main) + +add_executable(reserved_keys_test tests/reserved_keys_test.cc.cu) +target_compile_features(reserved_keys_test PUBLIC cxx_std_14) +set_target_properties(reserved_keys_test PROPERTIES CUDA_ARCHITECTURES OFF) +TARGET_LINK_LIBRARIES(reserved_keys_test gtest_main) \ No newline at end of file diff --git a/README.md b/README.md index 798819e3..039a8274 100644 --- a/README.md +++ b/README.md @@ -91,19 +91,30 @@ The `score_type` must be `uint64_t`. For more detail, please refer to [`class Ev It's recommended to keep the default configuration for the options ending with `*`. -| Name | Type | Default | Description | -|:------------------------|:----------------|:--------|:------------------------------------------------------| -| __init_capacity__ | size_t | 0 | The initial capacity of the hash table. | -| __max_capacity__ | size_t | 0 | The maximum capacity of the hash table. | -| __max_hbm_for_vectors__ | size_t | 0 | The maximum HBM for vectors, in bytes. | -| __dim__ | size_t | 64 | The dimension of the value vectors. | -| __max_bucket_size*__ | size_t | 128 | The length of each bucket. | -| __max_load_factor*__ | float | 0.5f | The max load factor before rehashing. | -| __block_size*__ | int | 128 | The default block size for CUDA kernels. | -| __io_block_size*__ | int | 1024 | The block size for IO CUDA kernels. | -| __device_id*__ | int | -1 | The ID of device. Managed internally when set to `-1` | -| __io_by_cpu*__ | bool | false | The flag indicating if the CPU handles IO. | - +| Name | Type | Default | Description | +|:---------------------------|:-------|:--------|:------------------------------------------------------| +| __init_capacity__ | size_t | 0 | The initial capacity of the hash table. | +| __max_capacity__ | size_t | 0 | The maximum capacity of the hash table. | +| __max_hbm_for_vectors__ | size_t | 0 | The maximum HBM for vectors, in bytes. | +| __dim__ | size_t | 64 | The dimension of the value vectors. | +| __max_bucket_size*__ | size_t | 128 | The length of each bucket. | +| __max_load_factor*__ | float | 0.5f | The max load factor before rehashing. | +| __block_size*__ | int | 128 | The default block size for CUDA kernels. | +| __io_block_size*__ | int | 1024 | The block size for IO CUDA kernels. | +| __device_id*__ | int | -1 | The ID of device. Managed internally when set to `-1` | +| __io_by_cpu*__ | bool | false | The flag indicating if the CPU handles IO. | +| __reserved_key_start_bit__ | int | 0 | The start bit offset of reserved key in the 64 bit | + +#### Reserved Keys +- The keys of `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal using. +- Call set options.reserved_key_start_bit to change the reserved keys if the default one conflicted with your keys. + The valid range of reserved_key_start_bit is [0, 62] and the default value is 0, meaning the default reserved keys. + reserved_key_start_bit = 1 means using the insignificant bits 1 and 2 as the keys as the reserved keys, + in binary format, it looks like 111~11xx0, and the index 0 bit is 0 and all the other bits are positive, in this case the new reserved keys are + `FFFFFFFFFFFFFFFE`, `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA` + reserved_key_start_bit = 2, in binary format, it looks like 111~11xx10, bit offset 0 are always 0 for any reserved_key_start_bit != 0, +- if you change the reserved_key_start_bit, you should use same value for save/load + For more detail, please refer to [`init_reserved_keys`](https://github.com/search?q=repo%3ANVIDIA-Merlin%2FHierarchicalKV%20init_reserved_keys&type=code). For more detail, please refer to [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60). ### How to use: @@ -142,13 +153,10 @@ int main(int argc, char *argv[]) } ``` - ### Usage restrictions - The `key_type` must be `int64_t` or `uint64_t`. - The `score_type` must be `uint64_t`. -- The keys of `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal using. - ## Contributors HierarchicalKV is co-maintianed by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users, @@ -172,6 +180,11 @@ cd HierarchicalKV && mkdir -p build && cd build cmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j ``` +For Debug: +```shell +cmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. && make -j +``` + For Benchmark: ```shell ./merlin_hashtable_benchmark diff --git a/include/merlin/core_kernels.cuh b/include/merlin/core_kernels.cuh index d8a85002..4bca3173 100644 --- a/include/merlin/core_kernels.cuh +++ b/include/merlin/core_kernels.cuh @@ -640,7 +640,7 @@ __global__ void remove_kernel(const Table* __restrict table, t += blockDim.x * gridDim.x) { int key_idx = t / TILE_SIZE; K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; + if (IS_RESERVED_KEY(find_key)) continue; int key_pos = -1; @@ -719,7 +719,7 @@ __global__ void remove_kernel(const Table* __restrict table, bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); current_score = bucket->scores(key_offset)->load(cuda::std::memory_order_relaxed); - if (!IS_RESERVED_KEY(current_key)) { + if (!IS_RESERVED_KEY(current_key)) { if (pred(current_key, current_score, pattern, threshold)) { atomicAdd(count, 1); key_pos = key_offset; @@ -782,7 +782,7 @@ __global__ void dump_kernel(const Table* __restrict table, const int key_idx{static_cast((tid + offset) % bucket_max_size)}; const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)}; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { size_t local_index{atomicAdd(&block_acc, 1)}; block_tuples[local_index] = { key, &bucket->vectors[key_idx * dim], @@ -846,7 +846,7 @@ __global__ void dump_kernel(const Table* __restrict table, (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed); - if (!IS_RESERVED_KEY(key) && pred(key, score, pattern, threshold)) { + if (!IS_RESERVED_KEY(key) && pred(key, score, pattern, threshold)) { size_t local_index = atomicAdd(&block_acc, 1); block_result_key[local_index] = key; for (int i = 0; i < dim; i++) { diff --git a/include/merlin/core_kernels/accum_or_assign.cuh b/include/merlin/core_kernels/accum_or_assign.cuh index 7f557f59..fbabb0fd 100644 --- a/include/merlin/core_kernels/accum_or_assign.cuh +++ b/include/merlin/core_kernels/accum_or_assign.cuh @@ -98,7 +98,7 @@ __global__ void accum_or_assign_kernel_with_io( const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; + if (IS_RESERVED_KEY(insert_key)) continue; const S insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); @@ -222,7 +222,7 @@ __global__ void accum_or_assign_kernel( const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; + if (IS_RESERVED_KEY(insert_key)) continue; const S insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); diff --git a/include/merlin/core_kernels/contains.cuh b/include/merlin/core_kernels/contains.cuh index 1da4a4b8..a545e00e 100644 --- a/include/merlin/core_kernels/contains.cuh +++ b/include/merlin/core_kernels/contains.cuh @@ -253,7 +253,7 @@ __global__ void contains_kernel(const Table* __restrict table, int key_idx = t / TILE_SIZE; const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; + if (IS_RESERVED_KEY(find_key)) continue; int key_pos = -1; int src_lane = -1; diff --git a/include/merlin/core_kernels/find_or_insert.cuh b/include/merlin/core_kernels/find_or_insert.cuh index d0668841..654a230b 100644 --- a/include/merlin/core_kernels/find_or_insert.cuh +++ b/include/merlin/core_kernels/find_or_insert.cuh @@ -53,7 +53,7 @@ __global__ void tlp_v1_find_or_insert_kernel_with_io( key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -272,7 +272,7 @@ __global__ void tlp_v2_find_or_insert_kernel_with_io( key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -621,7 +621,7 @@ __global__ void pipeline_find_or_insert_kernel_with_io( S* sm_param_scores = SMM::param_scores(smem); __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S)); } - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -1326,7 +1326,7 @@ __global__ void find_or_insert_kernel_with_io( const K find_or_insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_or_insert_key)) continue; + if (IS_RESERVED_KEY(find_or_insert_key)) continue; const S find_or_insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); @@ -1463,7 +1463,7 @@ __global__ void find_or_insert_kernel_lock_key_hybrid( score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -1713,7 +1713,7 @@ __global__ void find_or_insert_kernel( const K find_or_insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_or_insert_key)) continue; + if (IS_RESERVED_KEY(find_or_insert_key)) continue; const S find_or_insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); diff --git a/include/merlin/core_kernels/find_ptr_or_insert.cuh b/include/merlin/core_kernels/find_ptr_or_insert.cuh index 2e5d8ff7..87472619 100644 --- a/include/merlin/core_kernels/find_ptr_or_insert.cuh +++ b/include/merlin/core_kernels/find_ptr_or_insert.cuh @@ -53,7 +53,7 @@ __global__ void find_or_insert_ptr_kernel_lock_key( key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -283,7 +283,7 @@ __global__ void find_ptr_or_insert_kernel( const K find_or_insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_or_insert_key)) continue; + if (IS_RESERVED_KEY(find_or_insert_key)) continue; const S find_or_insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); diff --git a/include/merlin/core_kernels/lookup.cuh b/include/merlin/core_kernels/lookup.cuh index 8310460f..9c443659 100644 --- a/include/merlin/core_kernels/lookup.cuh +++ b/include/merlin/core_kernels/lookup.cuh @@ -885,7 +885,7 @@ __global__ void lookup_kernel_with_io( int key_idx = t / TILE_SIZE; const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; + if (IS_RESERVED_KEY(find_key)) continue; V* find_value = values + key_idx * dim; @@ -1015,7 +1015,7 @@ __device__ void tlp_lookup_kernel_hybrid_impl( if (kv_idx < n) { key = keys[kv_idx]; if (dst_offset) dst_offset[kv_idx] = kv_idx; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -1140,7 +1140,7 @@ __device__ void lookup_kernel_impl( int key_idx = t / TILE_SIZE; const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; + if (IS_RESERVED_KEY(find_key)) continue; int key_pos = -1; int src_lane = -1; diff --git a/include/merlin/core_kernels/lookup_ptr.cuh b/include/merlin/core_kernels/lookup_ptr.cuh index cfa5ea56..ce57b0b9 100644 --- a/include/merlin/core_kernels/lookup_ptr.cuh +++ b/include/merlin/core_kernels/lookup_ptr.cuh @@ -41,7 +41,7 @@ __global__ void tlp_lookup_ptr_kernel_with_filter( uint32_t key_pos = {0}; if (kv_idx < n) { key = keys[kv_idx]; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -140,7 +140,7 @@ __global__ void lookup_ptr_kernel(const Table* __restrict table, int key_idx = t / TILE_SIZE; const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; + if (IS_RESERVED_KEY(find_key)) continue; int key_pos = -1; int src_lane = -1; diff --git a/include/merlin/core_kernels/update.cuh b/include/merlin/core_kernels/update.cuh index 731e312b..69bcd250 100644 --- a/include/merlin/core_kernels/update.cuh +++ b/include/merlin/core_kernels/update.cuh @@ -44,7 +44,7 @@ __global__ void tlp_update_kernel_with_io( if (kv_idx < n) { key = keys[kv_idx]; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -680,7 +680,7 @@ __global__ void update_kernel_with_io( const K update_key = keys[key_idx]; - if (IS_RESERVED_KEY(update_key)) continue; + if (IS_RESERVED_KEY(update_key)) continue; const V* update_value = values + key_idx * dim; @@ -773,7 +773,7 @@ __global__ void tlp_update_kernel_hybrid( if (kv_idx < n) { key = keys[kv_idx]; if (src_offset) src_offset[kv_idx] = kv_idx; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -878,7 +878,7 @@ __global__ void update_kernel(const Table* __restrict table, const K update_key = keys[key_idx]; - if (IS_RESERVED_KEY(update_key)) continue; + if (IS_RESERVED_KEY(update_key)) continue; size_t bkt_idx = 0; size_t start_idx = 0; diff --git a/include/merlin/core_kernels/update_score.cuh b/include/merlin/core_kernels/update_score.cuh index fc24a745..9f4460c3 100644 --- a/include/merlin/core_kernels/update_score.cuh +++ b/include/merlin/core_kernels/update_score.cuh @@ -43,7 +43,7 @@ __global__ void tlp_update_score_kernel(Bucket* __restrict__ buckets, if (kv_idx < n) { key = keys[kv_idx]; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -549,7 +549,7 @@ __global__ void update_score_kernel(const Table* __restrict table, const K update_key = keys[key_idx]; - if (IS_RESERVED_KEY(update_key)) continue; + if (IS_RESERVED_KEY(update_key)) continue; size_t bkt_idx = 0; size_t start_idx = 0; diff --git a/include/merlin/core_kernels/update_values.cuh b/include/merlin/core_kernels/update_values.cuh index c2e3d17f..298d3bb9 100644 --- a/include/merlin/core_kernels/update_values.cuh +++ b/include/merlin/core_kernels/update_values.cuh @@ -42,7 +42,7 @@ __global__ void tlp_update_values_kernel_with_io( if (kv_idx < n) { key = keys[kv_idx]; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -638,7 +638,7 @@ __global__ void update_values_kernel_with_io( const K update_key = keys[key_idx]; - if (IS_RESERVED_KEY(update_key)) continue; + if (IS_RESERVED_KEY(update_key)) continue; const V* update_value = values + key_idx * dim; @@ -724,7 +724,7 @@ __global__ void tlp_update_values_kernel_hybrid( if (kv_idx < n) { key = keys[kv_idx]; if (src_offset) src_offset[kv_idx] = kv_idx; - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -823,7 +823,7 @@ __global__ void update_values_kernel(const Table* __restrict table, const K update_key = keys[key_idx]; - if (IS_RESERVED_KEY(update_key)) continue; + if (IS_RESERVED_KEY(update_key)) continue; size_t bkt_idx = 0; size_t start_idx = 0; diff --git a/include/merlin/core_kernels/upsert.cuh b/include/merlin/core_kernels/upsert.cuh index 37e505a1..c955c210 100644 --- a/include/merlin/core_kernels/upsert.cuh +++ b/include/merlin/core_kernels/upsert.cuh @@ -53,7 +53,7 @@ __global__ void tlp_v1_upsert_kernel_with_io( key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -268,7 +268,7 @@ __global__ void tlp_v2_upsert_kernel_with_io( key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -596,7 +596,7 @@ __global__ void pipeline_upsert_kernel_with_io( S* sm_param_scores = SMM::param_scores(smem); __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S)); } - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -1247,7 +1247,9 @@ __global__ void upsert_kernel_with_io_core( const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; + if (IS_RESERVED_KEY(insert_key)) { + continue; + } const S insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); @@ -1280,7 +1282,9 @@ __global__ void upsert_kernel_with_io_core( occupy_result = g.shfl(occupy_result, src_lane); } while (occupy_result == OccupyResult::CONTINUE); - if (occupy_result == OccupyResult::REFUSED) continue; + if (occupy_result == OccupyResult::REFUSED) { + continue; + } if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && @@ -1377,7 +1381,7 @@ __global__ void upsert_kernel_lock_key_hybrid( } score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -1612,7 +1616,7 @@ __global__ void upsert_kernel(const Table* __restrict table, size_t key_idx = t / TILE_SIZE; const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; + if (IS_RESERVED_KEY(insert_key)) continue; const S insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); diff --git a/include/merlin/core_kernels/upsert_and_evict.cuh b/include/merlin/core_kernels/upsert_and_evict.cuh index c685dca7..d43af108 100644 --- a/include/merlin/core_kernels/upsert_and_evict.cuh +++ b/include/merlin/core_kernels/upsert_and_evict.cuh @@ -56,7 +56,7 @@ __global__ void tlp_v1_upsert_and_evict_kernel_unique( if (kv_idx < n) { key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -285,7 +285,7 @@ __global__ void tlp_v2_upsert_and_evict_kernel_unique( if (kv_idx < n) { key = keys[kv_idx]; score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch); - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -639,7 +639,7 @@ __global__ void pipeline_upsert_and_evict_kernel_unique( S* sm_param_scores = SMM::param_scores(smem); __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S)); } - if (!IS_RESERVED_KEY(key)) { + if (!IS_RESERVED_KEY(key)) { const K hashed_key = Murmur3HashDevice(key); target_digests = digests_from_hashed(hashed_key); uint64_t global_idx = @@ -1397,7 +1397,7 @@ __global__ void upsert_and_evict_kernel_with_io_core( const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; + if (IS_RESERVED_KEY(insert_key)) continue; const S insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); diff --git a/include/merlin/types.cuh b/include/merlin/types.cuh index 176aa208..c8ac799f 100644 --- a/include/merlin/types.cuh +++ b/include/merlin/types.cuh @@ -20,6 +20,7 @@ #include #include #include +#include "debug.hpp" namespace nv { namespace merlin { @@ -43,18 +44,70 @@ using byte = uint8_t; // Digest. using D = byte; +constexpr uint64_t DEFAULT_EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF); +constexpr uint64_t DEFAULT_RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE); +constexpr uint64_t DEFAULT_LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD); + +constexpr uint64_t DEFAULT_RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC); +constexpr uint64_t DEFAULT_VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF); -constexpr uint64_t RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD); -constexpr uint64_t RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC); constexpr uint64_t MAX_SCORE = UINT64_C(0xFFFFFFFFFFFFFFFF); constexpr uint64_t EMPTY_SCORE = UINT64_C(0); constexpr uint64_t IGNORED_GLOBAL_EPOCH = UINT64_C(0xFFFFFFFFFFFFFFFF); -#define IS_RESERVED_KEY(key) ((RESERVED_KEY_MASK & (key)) == RESERVED_KEY_MASK) -#define IS_VACANT_KEY(key) ((VACANT_KEY_MASK & (key)) == VACANT_KEY_MASK) +uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY; +__constant__ uint64_t EMPTY_KEY = DEFAULT_EMPTY_KEY; +__constant__ uint64_t RECLAIM_KEY = DEFAULT_RECLAIM_KEY; +__constant__ uint64_t LOCKED_KEY = DEFAULT_LOCKED_KEY; + +__constant__ uint64_t RESERVED_KEY_MASK_1 = DEFAULT_RESERVED_KEY_MASK; +__constant__ uint64_t RESERVED_KEY_MASK_2 = DEFAULT_RESERVED_KEY_MASK; +__constant__ uint64_t VACANT_KEY_MASK_1 = DEFAULT_VACANT_KEY_MASK; +__constant__ uint64_t VACANT_KEY_MASK_2 = DEFAULT_VACANT_KEY_MASK; + +constexpr int MAX_RESERVED_KEY_BIT = 62; + +template +__forceinline__ __device__ bool IS_RESERVED_KEY(K key) { + return (RESERVED_KEY_MASK_1 & key) == RESERVED_KEY_MASK_2; +} + +template +__forceinline__ __device__ bool IS_VACANT_KEY(K key) { + return (VACANT_KEY_MASK_1 & key) == VACANT_KEY_MASK_2; +} + +cudaError_t init_reserved_keys(int index) { + if (index < 1 || index > MAX_RESERVED_KEY_BIT) { + // index = 0 is the default, + // index = 62 is the maximum index can be set for reserved keys. + return cudaSuccess; + } + uint64_t reservedKeyMask1 = ~(UINT64_C(3) << index); + uint64_t reservedKeyMask2 = reservedKeyMask1 & ~UINT64_C(1); + uint64_t vacantKeyMask1 = ~(UINT64_C(1) << index); + uint64_t vacantKeyMask2 = vacantKeyMask1 & ~UINT64_C(1); + + uint64_t emptyKey = reservedKeyMask2 | (UINT64_C(3) << index); + uint64_t reclaimKey = vacantKeyMask2; + uint64_t lockedKey = emptyKey & ~(UINT64_C(2) << index); + EMPTY_KEY_CPU = emptyKey; + +// printf("reserved keys are emptyKey, reclaimKey, lockedKey and reservedKeyMask2\n"); +// printf("emptyKey: %lx, reclaimKey: %lx\n", emptyKey, reclaimKey); +// printf("lockedKey: %lx, reservedKeyMask1: %lx\n", lockedKey, reservedKeyMask1); +// printf("reservedKeyMask2: %lx, vacantKeyMask1: %lx\n", reservedKeyMask2, vacantKeyMask1); + + CUDA_CHECK(cudaMemcpyToSymbol(EMPTY_KEY, &emptyKey, sizeof(uint64_t))); + CUDA_CHECK(cudaMemcpyToSymbol(RECLAIM_KEY, &reclaimKey, sizeof(uint64_t))); + CUDA_CHECK(cudaMemcpyToSymbol(LOCKED_KEY, &lockedKey, sizeof(uint64_t))); + + CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_1, &reservedKeyMask1, sizeof(uint64_t))); + CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_2, &reservedKeyMask2, sizeof(uint64_t))); + CUDA_CHECK(cudaMemcpyToSymbol(VACANT_KEY_MASK_1, &vacantKeyMask1, sizeof(uint64_t))); + CUDA_CHECK(cudaMemcpyToSymbol(VACANT_KEY_MASK_2, &vacantKeyMask2, sizeof(uint64_t))); + return cudaGetLastError(); +} template using AtomicKey = cuda::atomic; diff --git a/include/merlin/utils.cuh b/include/merlin/utils.cuh index 316f12bf..09499152 100644 --- a/include/merlin/utils.cuh +++ b/include/merlin/utils.cuh @@ -314,6 +314,20 @@ inline void free_pointers(cudaStream_t stream, int n, ...) { va_end(args); } +__global__ void memset64bitKernel(void* devPtr, uint64_t value, size_t count) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + static_cast(devPtr)[idx] = value; + } +} + +__forceinline__ __host__ cudaError_t memset64Async(void* devPtr, uint64_t value, size_t count, cudaStream_t stream = 0) { + int blockSize = 256; + int numBlocks = (count + blockSize - 1) / blockSize; + memset64bitKernel<<>>(devPtr, value, count); + return cudaGetLastError(); +} + #define CUDA_FREE_POINTERS(stream, ...) \ nv::merlin::free_pointers( \ stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__); diff --git a/include/merlin_hashtable.cuh b/include/merlin_hashtable.cuh index 9586f16a..ba435007 100644 --- a/include/merlin_hashtable.cuh +++ b/include/merlin_hashtable.cuh @@ -98,6 +98,17 @@ struct HashTableOptions { int device_id = -1; ///< The ID of device. bool io_by_cpu = false; ///< The flag indicating if the CPU handles IO. bool use_constant_memory = false; ///< reserved + /* + * reserved_key_start_bit = 0, is the default behavior, HKV reserves `0xFFFFFFFFFFFFFFFD`, + * `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` for internal using. + * if the default one conflicted with your keys, change the reserved_key_start_bit + * value to a numbers between 1 and 62, reserved_key_start_bit = 1 means using the + * insignificant bits index 1 and 2 as the keys as the reserved keys and + * the index 0 bit is 0 and all the other bits are 1, the new reserved keys are + * `FFFFFFFFFFFFFFFE`, `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA` + * the console log prints the reserved keys during the table initialization. + */ + int reserved_key_start_bit = 0; ///< The binary index of reserved key. MemoryPoolOptions device_memory_pool; ///< Configuration options for device memory pool. MemoryPoolOptions @@ -865,6 +876,10 @@ class HashTable : public HashTableBase { return; } options_ = options; + MERLIN_CHECK(options.reserved_key_start_bit >= 0 && + options.reserved_key_start_bit <= MAX_RESERVED_KEY_BIT, + "options.reserved_key_start_bit should >= 0 and <= 62."); + CUDA_CHECK(init_reserved_keys(options.reserved_key_start_bit)); default_allocator_ = (allocator == nullptr); allocator_ = (allocator == nullptr) ? (new DefaultAllocator()) : allocator; @@ -1103,8 +1118,6 @@ class HashTable : public HashTableBase { * shape (n, DIM). * @param scores The scores to insert on GPU-accessible memory with shape * (n). - * @param scores The scores to insert on GPU-accessible memory with shape - * (n). * @params evicted_keys The output of keys replaced with minimum score. * @params evicted_values The output of values replaced with minimum score on * keys. @@ -1198,8 +1211,7 @@ class HashTable : public HashTableBase { size_type block_size = options_.block_size; size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size); - CUDA_CHECK(cudaMemsetAsync(evicted_keys, static_cast(EMPTY_KEY), - n * sizeof(K), stream)); + CUDA_CHECK(memset64Async(evicted_keys, EMPTY_KEY_CPU, n, stream)); using Selector = SelectUpsertAndEvictKernelWithIO; @@ -1236,8 +1248,6 @@ class HashTable : public HashTableBase { * shape (n, DIM). * @param scores The scores to insert on GPU-accessible memory with shape * (n). - * @param scores The scores to insert on GPU-accessible memory with shape - * (n). * @params evicted_keys The output of keys replaced with minimum score. * @params evicted_values The output of values replaced with minimum score on * keys. diff --git a/tests/accum_or_assign_test.cc.cu b/tests/accum_or_assign_test.cc.cu index 6182bde1..4f0d776f 100644 --- a/tests/accum_or_assign_test.cc.cu +++ b/tests/accum_or_assign_test.cc.cu @@ -52,7 +52,7 @@ struct ExportIfPredFunctor { } }; -void test_basic_when_full(size_t max_hbm_for_vectors) { +void test_basic_when_full(size_t max_hbm_for_vectors, int key_start) { constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL; @@ -65,7 +65,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -162,7 +162,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_erase_if_pred(size_t max_hbm_for_vectors) { +void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start) { constexpr uint64_t INIT_CAPACITY = 256UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 128UL; @@ -176,7 +176,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { bool* h_accum_or_assigns; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -301,7 +301,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_rehash(size_t max_hbm_for_vectors) { +void test_rehash(size_t max_hbm_for_vectors, int key_start) { constexpr uint64_t BUCKET_MAX_SIZE = 128ul; constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE; constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY; @@ -316,7 +316,7 @@ void test_rehash(size_t max_hbm_for_vectors) { bool* h_accum_or_assigns; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -452,7 +452,7 @@ void test_rehash(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { +void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start) { constexpr uint64_t INIT_CAPACITY = 1024; constexpr uint64_t MAX_CAPACITY = 16 * 1024; constexpr uint64_t INIT_KEY_NUM = 1024; @@ -468,7 +468,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { float true_ratio = 0.6f; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -759,7 +759,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { // ASSERT_EQ(table->capacity(), MAX_CAPACITY); //} // -void test_export_batch_if(size_t max_hbm_for_vectors) { +void test_export_batch_if(size_t max_hbm_for_vectors, int key_start) { constexpr uint64_t INIT_CAPACITY = 256UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 128UL; @@ -773,7 +773,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) { size_t h_dump_counter = 0; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1140,7 +1140,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1152,7 +1152,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { constexpr float true_ratio = 0.5; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1364,7 +1364,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr int RSHIFT_ON_NANO = 20; constexpr uint64_t BUCKET_NUM = 8UL; @@ -1379,6 +1379,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1590,7 +1591,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1603,6 +1604,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1854,7 +1856,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1867,6 +1869,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2112,7 +2115,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -2126,6 +2129,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2365,7 +2369,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BATCH_SIZE = 1024 * 1024ul; constexpr uint64_t STEPS = 128; constexpr uint64_t MAX_BUCKET_SIZE = 128; @@ -2378,6 +2382,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2500,7 +2505,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { +void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_MAX_SIZE = 128ul; constexpr uint64_t INIT_CAPACITY = 4 * 1024 - BUCKET_MAX_SIZE - 1; constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY; @@ -2511,6 +2516,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2864,27 +2870,27 @@ void test_accum_or_assign_values_check(size_t max_hbm_for_vectors) { } TEST(AccumOrAssignTest, test_export_batch_if) { - test_export_batch_if(16); - test_export_batch_if(0); + test_export_batch_if(16, 22); + test_export_batch_if(0, 0); } TEST(AccumOrAssignTest, test_basic_when_full) { - test_basic_when_full(16); - test_basic_when_full(0); + test_basic_when_full(16, 2); + test_basic_when_full(0, 0); } TEST(AccumOrAssignTest, test_erase_if_pred) { - test_erase_if_pred(16); - test_erase_if_pred(0); + test_erase_if_pred(16, 0); + test_erase_if_pred(0, 5); } TEST(AccumOrAssignTest, test_rehash) { - test_rehash(16); - test_rehash(0); + test_rehash(16, 7); + test_rehash(0, 0); } TEST(AccumOrAssignTest, test_rehash_on_big_batch) { - test_rehash_on_big_batch(16); - test_rehash_on_big_batch(0); + test_rehash_on_big_batch(16, 9); + test_rehash_on_big_batch(0, 0); } TEST(AccumOrAssignTest, test_dynamic_rehash_on_multi_threads) { - test_dynamic_rehash_on_multi_threads(16); + test_dynamic_rehash_on_multi_threads(16, 56); test_dynamic_rehash_on_multi_threads(0); } TEST(AccumOrAssignTest, test_evict_strategy_lru_basic) { @@ -2892,32 +2898,32 @@ TEST(AccumOrAssignTest, test_evict_strategy_lru_basic) { test_evict_strategy_lru_basic(0); } TEST(AccumOrAssignTest, test_evict_strategy_lfu_basic) { - test_evict_strategy_lfu_basic(16); + test_evict_strategy_lfu_basic(16, 3); // TODO(rhdong): Add back when diff error issue fixed in hybrid mode. // test_evict_strategy_lfu_basic(0); } TEST(AccumOrAssignTest, test_evict_strategy_epochlru_basic) { - test_evict_strategy_epochlru_basic(16); + test_evict_strategy_epochlru_basic(16, 33); test_evict_strategy_epochlru_basic(0); } TEST(AccumOrAssignTest, test_evict_strategy_epochlfu_basic) { test_evict_strategy_epochlfu_basic(16); - test_evict_strategy_epochlfu_basic(0); + test_evict_strategy_epochlfu_basic(0, 44); } TEST(AccumOrAssignTest, test_evict_strategy_customized_basic) { test_evict_strategy_customized_basic(16); - test_evict_strategy_customized_basic(0); + test_evict_strategy_customized_basic(0, 23); } TEST(AccumOrAssignTest, test_evict_strategy_customized_advanced) { - test_evict_strategy_customized_advanced(16); + test_evict_strategy_customized_advanced(16, 16); test_evict_strategy_customized_advanced(0); } TEST(AccumOrAssignTest, test_evict_strategy_customized_correct_rate) { // TODO(rhdong): after blossom CI issue is resolved, the skip logic. const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI")); - test_evict_strategy_customized_correct_rate(16); + test_evict_strategy_customized_correct_rate(16, 61); if (!skip_hmem_check) { test_evict_strategy_customized_correct_rate(0); } else { diff --git a/tests/assign_score_test.cc.cu b/tests/assign_score_test.cc.cu index abde3924..87241158 100644 --- a/tests/assign_score_test.cc.cu +++ b/tests/assign_score_test.cc.cu @@ -38,7 +38,7 @@ using S = uint64_t; using EvictStrategy = nv::merlin::EvictStrategy; using TableOptions = nv::merlin::HashTableOptions; -void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -50,6 +50,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -217,7 +218,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -229,6 +230,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -392,7 +394,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr int RSHIFT_ON_NANO = 20; constexpr uint64_t BUCKET_NUM = 8UL; @@ -406,6 +408,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -580,7 +583,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -592,6 +595,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -792,7 +796,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -804,6 +808,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -962,7 +967,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -974,6 +979,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1500,7 +1506,7 @@ void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) { } } -void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BATCH_SIZE = 1024 * 1024ul; constexpr uint64_t STEPS = 128; constexpr uint64_t MAX_BUCKET_SIZE = 128; @@ -1512,6 +1518,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1776,26 +1783,26 @@ void test_find_or_insert_values_check(size_t max_hbm_for_vectors) { TEST(AssignScoreTest, test_evict_strategy_lru_basic) { test_evict_strategy_lru_basic(16); - test_evict_strategy_lru_basic(0); + test_evict_strategy_lru_basic(0, 34); } TEST(AssignScoreTest, test_evict_strategy_lfu_basic) { test_evict_strategy_lfu_basic(16); - test_evict_strategy_lfu_basic(0); + test_evict_strategy_lfu_basic(0, 2); } TEST(AssignScoreTest, test_evict_strategy_epochlru_basic) { - test_evict_strategy_epochlru_basic(16); + test_evict_strategy_epochlru_basic(16, 51); test_evict_strategy_epochlru_basic(0); } TEST(AssignScoreTest, test_evict_strategy_epochlfu_basic) { - test_evict_strategy_epochlfu_basic(16); + test_evict_strategy_epochlfu_basic(16, 4); test_evict_strategy_epochlfu_basic(0); } TEST(AssignScoreTest, test_evict_strategy_customized_basic) { test_evict_strategy_customized_basic(16); - test_evict_strategy_customized_basic(0); + test_evict_strategy_customized_basic(0, 11); } TEST(AssignScoreTest, test_evict_strategy_customized_advanced) { - test_evict_strategy_customized_advanced(16); + test_evict_strategy_customized_advanced(16, 33); test_evict_strategy_customized_advanced(0); } TEST(AssignScoreTest, test_assign_advanced_on_epochlfu) { @@ -1804,7 +1811,7 @@ TEST(AssignScoreTest, test_assign_advanced_on_epochlfu) { TEST(AssignScoreTest, test_evict_strategy_customized_correct_rate) { // TODO(rhdong): after blossom CI issue is resolved, the skip logic. const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI")); - test_evict_strategy_customized_correct_rate(16); + test_evict_strategy_customized_correct_rate(16, 44); if (!skip_hmem_check) { test_evict_strategy_customized_correct_rate(0); } else { diff --git a/tests/assign_values_test.cc.cu b/tests/assign_values_test.cc.cu index c721addf..07eaf1e4 100644 --- a/tests/assign_values_test.cc.cu +++ b/tests/assign_values_test.cc.cu @@ -37,7 +37,7 @@ using S = uint64_t; using EvictStrategy = nv::merlin::EvictStrategy; using TableOptions = nv::merlin::HashTableOptions; -void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -49,6 +49,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -202,7 +203,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start = 0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -214,6 +215,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -735,12 +737,12 @@ void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) { } TEST(AssignValuesTest, test_evict_strategy_lru_basic) { - test_evict_strategy_lru_basic(16); + test_evict_strategy_lru_basic(16, 21); test_evict_strategy_lru_basic(0); } TEST(AssignValuesTest, test_evict_strategy_epochlfu_basic) { test_evict_strategy_epochlfu_basic(16); - test_evict_strategy_epochlfu_basic(0); + test_evict_strategy_epochlfu_basic(0, 8); } TEST(AssignValuesTest, test_assign_advanced_on_epochlfu) { test_assign_advanced_on_epochlfu(16); diff --git a/tests/find_or_insert_ptr_test.cc.cu b/tests/find_or_insert_ptr_test.cc.cu index 1588a699..f82b18aa 100644 --- a/tests/find_or_insert_ptr_test.cc.cu +++ b/tests/find_or_insert_ptr_test.cc.cu @@ -56,7 +56,7 @@ struct ExportIfPredFunctor { } }; -void test_basic(size_t max_hbm_for_vectors) { +void test_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL; @@ -69,6 +69,7 @@ void test_basic(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -331,7 +332,7 @@ void test_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_basic_when_full(size_t max_hbm_for_vectors) { +void test_basic_when_full(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL; @@ -344,6 +345,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { TableOptions options; + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -463,7 +465,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_erase_if_pred(size_t max_hbm_for_vectors) { +void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 256UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 128UL; @@ -476,7 +478,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -614,7 +616,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_rehash(size_t max_hbm_for_vectors) { +void test_rehash(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_MAX_SIZE = 128ul; constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE; constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY; @@ -626,7 +628,7 @@ void test_rehash(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -771,7 +773,7 @@ void test_rehash(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { +void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 1024; constexpr uint64_t MAX_CAPACITY = 16 * 1024; constexpr uint64_t INIT_KEY_NUM = 1024; @@ -782,7 +784,6 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -945,7 +946,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { +void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_MAX_SIZE = 128ul; constexpr uint64_t INIT_CAPACITY = 4 * 1024; constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY; @@ -955,7 +956,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { std::vector threads; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1089,7 +1090,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { ASSERT_EQ(table->capacity(), MAX_CAPACITY); } -void test_export_batch_if(size_t max_hbm_for_vectors) { +void test_export_batch_if(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 256UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 128UL; @@ -1102,7 +1103,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) { size_t h_dump_counter = 0; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1286,7 +1287,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_basic_for_cpu_io() { +void test_basic_for_cpu_io(int key_start=0) { constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL; @@ -1298,7 +1299,7 @@ void test_basic_for_cpu_io() { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1464,7 +1465,7 @@ void test_basic_for_cpu_io() { CudaCheckError(); } -void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1475,7 +1476,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1673,7 +1674,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1684,7 +1685,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1872,7 +1873,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr int RSHIFT_ON_NANO = 20; constexpr uint64_t BUCKET_NUM = 8UL; @@ -1885,7 +1886,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2092,7 +2093,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -2103,7 +2104,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2333,7 +2334,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -2344,7 +2345,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2532,7 +2533,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -2543,7 +2544,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 256; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2760,7 +2761,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BATCH_SIZE = 1024 * 1024ul; constexpr uint64_t STEPS = 128; constexpr uint64_t MAX_BUCKET_SIZE = 128; @@ -2771,7 +2772,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { const int rounds = 12; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -3443,7 +3444,7 @@ void test_find_or_insert_values_check(size_t max_hbm_for_vectors) { } } -void test_duplicated_keys(size_t max_hbm_for_vectors) { +void test_duplicated_keys(size_t max_hbm_for_vectors, size_t key_start = 0) { constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1024UL; @@ -3455,7 +3456,7 @@ void test_duplicated_keys(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -3542,7 +3543,7 @@ void test_duplicated_keys(size_t max_hbm_for_vectors) { TEST(FindOrInsertPtrTest, test_export_batch_if) { test_export_batch_if(16); - test_export_batch_if(0); + test_export_batch_if(0, 33); } TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) { test_find_or_insert_multi_threads(16, 0.25f, 0.125f); @@ -3551,58 +3552,61 @@ TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) { test_find_or_insert_multi_threads(0, 0.375f, 0.125f); } TEST(FindOrInsertPtrTest, test_basic) { - test_basic(16); + test_basic(16, 3); test_basic(0); } TEST(FindOrInsertPtrTest, test_basic_when_full) { - test_basic_when_full(16); + test_basic_when_full(16, 4); test_basic_when_full(0); } TEST(FindOrInsertPtrTest, test_erase_if_pred) { test_erase_if_pred(16); - test_erase_if_pred(0); + test_erase_if_pred(0, 18); } TEST(FindOrInsertPtrTest, test_rehash) { test_rehash(16); - test_rehash(0); + test_rehash(0, 44); } TEST(FindOrInsertPtrTest, test_rehash_on_big_batch) { - test_rehash_on_big_batch(16); + test_rehash_on_big_batch(16,23); test_rehash_on_big_batch(0); } TEST(FindOrInsertPtrTest, test_dynamic_rehash_on_multi_threads) { test_dynamic_rehash_on_multi_threads(16); - test_dynamic_rehash_on_multi_threads(0); + test_dynamic_rehash_on_multi_threads(0, 19); +} +TEST(FindOrInsertPtrTest, test_basic_for_cpu_io) { + test_basic_for_cpu_io(); + test_basic_for_cpu_io(52); } -TEST(FindOrInsertPtrTest, test_basic_for_cpu_io) { test_basic_for_cpu_io(); } TEST(FindOrInsertPtrTest, test_evict_strategy_lru_basic) { test_evict_strategy_lru_basic(16); - test_evict_strategy_lru_basic(0); + test_evict_strategy_lru_basic(0, 18); } TEST(FindOrInsertPtrTest, test_evict_strategy_lfu_basic) { - test_evict_strategy_lfu_basic(16); + test_evict_strategy_lfu_basic(16, 29); test_evict_strategy_lfu_basic(0); } TEST(FindOrInsertPtrTest, test_evict_strategy_epochlru_basic) { - test_evict_strategy_epochlru_basic(16); + test_evict_strategy_epochlru_basic(16, 45); test_evict_strategy_epochlru_basic(0); } TEST(FindOrInsertPtrTest, test_evict_strategy_epochlfu_basic) { test_evict_strategy_epochlfu_basic(16); - test_evict_strategy_epochlfu_basic(0); + test_evict_strategy_epochlfu_basic(0, 59); } TEST(FindOrInsertPtrTest, test_evict_strategy_customized_basic) { - test_evict_strategy_customized_basic(16); + test_evict_strategy_customized_basic(16, 38); test_evict_strategy_customized_basic(0); } TEST(FindOrInsertPtrTest, test_evict_strategy_customized_advanced) { test_evict_strategy_customized_advanced(16); - test_evict_strategy_customized_advanced(0); + test_evict_strategy_customized_advanced(0, 25); } TEST(FindOrInsertPtrTest, test_evict_strategy_customized_correct_rate) { // TODO(rhdong): after blossom CI issue is resolved, the skip logic. const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI")); - test_evict_strategy_customized_correct_rate(16); + test_evict_strategy_customized_correct_rate(16, 16); if (!skip_hmem_check) { test_evict_strategy_customized_correct_rate(0); } else { @@ -3615,6 +3619,6 @@ TEST(FindOrInsertPtrTest, test_find_or_insert_values_check) { test_find_or_insert_values_check(0); } TEST(FindOrInsertPtrTest, test_duplicated_keys) { - test_duplicated_keys(16); + test_duplicated_keys(16, 39); test_duplicated_keys(0); } diff --git a/tests/find_or_insert_test.cc.cu b/tests/find_or_insert_test.cc.cu index 9ee01cb8..1aa96d7a 100644 --- a/tests/find_or_insert_test.cc.cu +++ b/tests/find_or_insert_test.cc.cu @@ -56,7 +56,7 @@ struct ExportIfPredFunctor { } }; -void test_basic(size_t max_hbm_for_vectors) { +void test_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL; @@ -68,7 +68,7 @@ void test_basic(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -300,7 +300,7 @@ void test_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_basic_when_full(size_t max_hbm_for_vectors) { +void test_basic_when_full(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = INIT_CAPACITY; @@ -312,7 +312,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -423,7 +423,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_erase_if_pred(size_t max_hbm_for_vectors) { +void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 256UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 128UL; @@ -436,7 +436,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -559,7 +559,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_rehash(size_t max_hbm_for_vectors) { +void test_rehash(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_MAX_SIZE = 128ul; constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE; constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY; @@ -571,7 +571,7 @@ void test_rehash(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -701,7 +701,7 @@ void test_rehash(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { +void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 1024; constexpr uint64_t MAX_CAPACITY = 16 * 1024; constexpr uint64_t INIT_KEY_NUM = 1024; @@ -712,7 +712,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -845,7 +845,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { +void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_MAX_SIZE = 128ul; constexpr uint64_t INIT_CAPACITY = 4 * 1024; constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY; @@ -855,7 +855,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { std::vector threads; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -974,7 +974,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { ASSERT_EQ(table->capacity(), MAX_CAPACITY); } -void test_export_batch_if(size_t max_hbm_for_vectors) { +void test_export_batch_if(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t INIT_CAPACITY = 256UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 128UL; @@ -987,7 +987,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) { size_t h_dump_counter = 0; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1156,7 +1156,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_basic_for_cpu_io() { +void test_basic_for_cpu_io(int key_start=0) { constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL; constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY; constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL; @@ -1168,7 +1168,7 @@ void test_basic_for_cpu_io() { bool* h_found; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1305,7 +1305,7 @@ void test_basic_for_cpu_io() { CudaCheckError(); } -void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1316,7 +1316,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1482,7 +1482,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1493,7 +1493,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1655,7 +1655,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr int RSHIFT_ON_NANO = 20; constexpr uint64_t BUCKET_NUM = 8UL; @@ -1668,7 +1668,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -1843,7 +1843,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -1854,7 +1854,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2056,7 +2056,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -2067,7 +2067,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 128; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -2227,7 +2227,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { CudaCheckError(); } -void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { +void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors, int key_start=0) { constexpr uint64_t BUCKET_NUM = 8UL; constexpr uint64_t BUCKET_MAX_SIZE = 128UL; constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE; // 1024UL; @@ -2238,7 +2238,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { constexpr uint64_t TEST_TIMES = 256; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -3594,7 +3594,7 @@ void test_find_or_insert_values_check(size_t max_hbm_for_vectors) { TEST(FindOrInsertTest, test_export_batch_if) { test_export_batch_if(16); - test_export_batch_if(0); + test_export_batch_if(0, 31); } TEST(FindOrInsertTest, test_find_or_insert_multi_threads) { test_find_or_insert_multi_threads(16, 0.25f, 0.125f); @@ -3623,52 +3623,55 @@ TEST(FindOrInsertTest, test_value_type_hbm_mode) { test_value_type_hbm_mode(); } TEST(FindOrInsertTest, test_basic) { - test_basic(16); + test_basic(16, 61); test_basic(0); } TEST(FindOrInsertTest, test_basic_when_full) { test_basic_when_full(16); - test_basic_when_full(0); + test_basic_when_full(0, 41); } TEST(FindOrInsertTest, test_erase_if_pred) { test_erase_if_pred(16); - test_erase_if_pred(0); + test_erase_if_pred(0, 17); } TEST(FindOrInsertTest, test_rehash) { test_rehash(16); - test_rehash(0); + test_rehash(0, 22); } TEST(FindOrInsertTest, test_rehash_on_big_batch) { - test_rehash_on_big_batch(16); + test_rehash_on_big_batch(16, 37); test_rehash_on_big_batch(0); } TEST(FindOrInsertTest, test_dynamic_rehash_on_multi_threads) { - test_dynamic_rehash_on_multi_threads(16); + test_dynamic_rehash_on_multi_threads(16, 22); test_dynamic_rehash_on_multi_threads(0); } -TEST(FindOrInsertTest, test_basic_for_cpu_io) { test_basic_for_cpu_io(); } +TEST(FindOrInsertTest, test_basic_for_cpu_io) { + test_basic_for_cpu_io(45); + test_basic_for_cpu_io(); +} TEST(FindOrInsertTest, test_evict_strategy_lru_basic) { test_evict_strategy_lru_basic(16); - test_evict_strategy_lru_basic(0); + test_evict_strategy_lru_basic(0, 44); } TEST(FindOrInsertTest, test_evict_strategy_lfu_basic) { - test_evict_strategy_lfu_basic(16); + test_evict_strategy_lfu_basic(16, 34); test_evict_strategy_lfu_basic(0); } TEST(FindOrInsertTest, test_evict_strategy_epochlru_basic) { - test_evict_strategy_epochlru_basic(16); + test_evict_strategy_epochlru_basic(16, 41); test_evict_strategy_epochlru_basic(0); } TEST(FindOrInsertTest, test_evict_strategy_epochlfu_basic) { - test_evict_strategy_epochlfu_basic(16); + test_evict_strategy_epochlfu_basic(16, 42); test_evict_strategy_epochlfu_basic(0); } TEST(FindOrInsertTest, test_evict_strategy_customized_basic) { test_evict_strategy_customized_basic(16); - test_evict_strategy_customized_basic(0); + test_evict_strategy_customized_basic(0, 43); } TEST(FindOrInsertTest, test_evict_strategy_customized_advanced) { - test_evict_strategy_customized_advanced(16); + test_evict_strategy_customized_advanced(16, 54); test_evict_strategy_customized_advanced(0); } TEST(FindOrInsertTest, test_assign_advanced_on_epochlfu) { diff --git a/tests/find_with_missed_keys_test.cc.cu b/tests/find_with_missed_keys_test.cc.cu index 7b93761c..29b6e22f 100644 --- a/tests/find_with_missed_keys_test.cc.cu +++ b/tests/find_with_missed_keys_test.cc.cu @@ -30,7 +30,7 @@ using EvictStrategy = nv::merlin::EvictStrategy; using TableOptions = nv::merlin::HashTableOptions; void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size, - double load_factor, bool pipeline_lookup) { + double load_factor, bool pipeline_lookup, int key_start = 0) { MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0, "Invalid `load_factor`"); @@ -46,7 +46,7 @@ void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size, int* h_missed_indices; TableOptions options; - + options.reserved_key_start_bit = key_start; options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; @@ -157,14 +157,14 @@ void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size, TEST(FindTest, test_find_when_empty) { // pure HMEM - test_find(0, 128, 0.0, true); + test_find(0, 128, 0.0, true, 12); test_find(0, 256, 0.0, false); // hybrid - test_find(32, 128, 0.0, true); + test_find(32, 128, 0.0, true, 58); test_find(32, 256, 0.0, false); // pure HBM test_find(1024, 128, 0.0, true); - test_find(1024, 256, 0.0, false); + test_find(1024, 256, 0.0, false, 12); } TEST(FindTest, test_find_when_full) { @@ -173,7 +173,7 @@ TEST(FindTest, test_find_when_full) { test_find(0, 256, 1.0, false); // hybrid test_find(32, 128, 1.0, true); - test_find(32, 256, 1.0, false); + test_find(32, 256, 1.0, false, 60); // pure HBM test_find(1024, 128, 1.0, true); test_find(1024, 256, 1.0, false); @@ -181,32 +181,33 @@ TEST(FindTest, test_find_when_full) { TEST(FindTest, test_find_load_factor) { // pure HMEM - test_find(0, 128, 0.2, true); - test_find(0, 256, 0.2, false); + test_find(0, 128, 0.2, true, 45); + test_find(0, 256, 0.2, false, 12); // hybrid - test_find(32, 128, 0.2, true); - test_find(32, 256, 0.2, false); + test_find(32, 128, 0.2, true, 27); + test_find(32, 256, 0.2, false, 53); // pure HBM - test_find(1024, 128, 0.2, true); - test_find(1024, 256, 0.2, false); + test_find(1024, 128, 0.2, true, 9); + test_find(1024, 256, 0.2, false, 38); // pure HMEM - test_find(0, 128, 0.5, true); - test_find(0, 256, 0.5, false); + test_find(0, 128, 0.5, true, 21); + test_find(0, 256, 0.5, false, 46); // hybrid - test_find(32, 128, 0.5, true); - test_find(32, 256, 0.5, false); + test_find(32, 128, 0.5, true, 31); + test_find(32, 256, 0.5, false, 59); // pure HBM - test_find(1024, 128, 0.5, true); - test_find(1024, 256, 0.5, false); + test_find(1024, 128, 0.5, true, 4); + test_find(1024, 256, 0.5, false, 22); // pure HMEM - test_find(0, 128, 0.75, true); - test_find(0, 256, 0.75, false); + test_find(0, 128, 0.75, true, 11); + test_find(0, 256, 0.75, false, 34); // hybrid - test_find(32, 128, 0.75, true); - test_find(32, 256, 0.75, false); + test_find(32, 128, 0.75, true, 18); + test_find(32, 256, 0.75, false, 47); // pure HBM - test_find(1024, 128, 0.75, true); - test_find(1024, 256, 0.75, false); + test_find(1024, 128, 0.75, true, 7); + test_find(1024, 256, 0.75, false, 29); + } diff --git a/tests/merlin_hashtable_test.cc.cu b/tests/merlin_hashtable_test.cc.cu index ed80be3c..9492592c 100644 --- a/tests/merlin_hashtable_test.cc.cu +++ b/tests/merlin_hashtable_test.cc.cu @@ -145,6 +145,8 @@ void test_basic(size_t max_hbm_for_vectors) { options.dim = DIM; options.max_bucket_size = BUCKET_MAX_SIZE; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); + options.reserved_key_start_bit = 2; + using Table = nv::merlin::HashTable; CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K))); @@ -512,6 +514,7 @@ void test_basic_when_full(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 3; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -641,6 +644,7 @@ void test_erase_if_pred(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 4; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -775,6 +779,7 @@ void test_rehash(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 5; options.max_bucket_size = BUCKET_MAX_SIZE; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -916,6 +921,7 @@ void test_rehash_on_big_batch(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 6; options.max_bucket_size = 128; options.max_load_factor = 0.6; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); @@ -1059,6 +1065,7 @@ void test_rehash_on_big_batch_specific(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 7; options.max_bucket_size = 128; options.max_load_factor = 0.6; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); @@ -1129,6 +1136,7 @@ void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 8; options.max_load_factor = 0.50f; options.max_bucket_size = BUCKET_MAX_SIZE; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); @@ -1264,6 +1272,7 @@ void test_export_batch_if(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 9; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -1447,6 +1456,7 @@ void test_basic_for_cpu_io() { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 10; options.max_hbm_for_vectors = nv::merlin::GB(0); options.io_by_cpu = true; using Table = nv::merlin::HashTable; @@ -1595,6 +1605,7 @@ void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 11; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -1771,6 +1782,7 @@ void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 12; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -1948,6 +1960,7 @@ void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 13; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -2133,6 +2146,7 @@ void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 14; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -2344,6 +2358,7 @@ void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 15; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -2513,6 +2528,7 @@ void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 16; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -2712,6 +2728,7 @@ void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) { options.init_capacity = INIT_CAPACITY; options.max_capacity = MAX_CAPACITY; options.dim = DIM; + options.reserved_key_start_bit = 17; options.max_bucket_size = MAX_BUCKET_SIZE; options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors); using Table = nv::merlin::HashTable; @@ -3327,6 +3344,7 @@ void test_bucket_size(bool load_scores = true) { options.max_capacity = MAX_CAPACITY; options.dim = DIM; options.max_hbm_for_vectors = nv::merlin::GB(16); + options.reserved_key_start_bit = 1; using Table = nv::merlin::HashTable; CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K))); diff --git a/tests/reserved_keys_test.cc.cu b/tests/reserved_keys_test.cc.cu new file mode 100644 index 00000000..a69589c8 --- /dev/null +++ b/tests/reserved_keys_test.cc.cu @@ -0,0 +1,104 @@ +/* +* Copyright (c) 2024, NVIDIA CORPORATION. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include +#include "merlin/types.cuh" +#include "test_util.cuh" +#include "merlin/utils.cuh" + + +using namespace nv::merlin; + +__global__ void testReservedKeysKernel(uint64_t* keys, bool* results, size_t numKeys) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < numKeys) { + results[idx] = IS_RESERVED_KEY(keys[idx]); + } +} + +void testCustomMemsetAsync() { + size_t numElements = 4; + uint64_t value = 0xFFFFFFFFFFFFFFF1; + uint64_t* devPtr; + uint64_t* hostData = new uint64_t[numElements]; + + cudaMalloc((void**)&devPtr, numElements * sizeof(uint64_t)); + memset64Async(devPtr, value, numElements); + cudaMemcpy(hostData, devPtr, numElements * sizeof(uint64_t), cudaMemcpyDeviceToHost); + for (size_t i = 0; i < numElements; i++) { + assert(hostData[i] == value); + } + + std::cout << "All values were set correctly!" << std::endl; + + cudaFree(devPtr); + delete[] hostData; +} + +void testReservedKeys(uint64_t* testKeys, bool* expectedResults, size_t numKeys) { + uint64_t* d_keys; + bool* d_results; + bool* h_results = new bool[numKeys]; + + cudaMalloc(&d_keys, numKeys * sizeof(uint64_t)); + cudaMalloc(&d_results, numKeys * sizeof(bool)); + + cudaMemcpy(d_keys, testKeys, numKeys * sizeof(uint64_t), cudaMemcpyHostToDevice); + + int blockSize = 256; + int numBlocks = (numKeys + blockSize - 1) / blockSize; + + testReservedKeysKernel<<>>(d_keys, d_results, numKeys); + cudaDeviceSynchronize(); + + cudaMemcpy(h_results, d_results, numKeys * sizeof(bool), cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < numKeys; i++) { + assert(h_results[i] == expectedResults[i]); + } + + cudaFree(d_keys); + cudaFree(d_results); + delete[] h_results; + CudaCheckError(); + std::cout << "All tests passed." << std::endl; +} + +void testKeyOptions() { + for (int i = 0; i <= MAX_RESERVED_KEY_BIT; i++) { + CUDA_CHECK(init_reserved_keys(i)); + uint64_t host_reclaim_key, host_locked_key; + cudaMemcpyFromSymbol(&host_reclaim_key, RECLAIM_KEY, sizeof(uint64_t)); + cudaMemcpyFromSymbol(&host_locked_key, LOCKED_KEY, sizeof(uint64_t)); + + uint64_t testKeys[6] = { + EMPTY_KEY_CPU, host_reclaim_key, host_locked_key, + UINT64_C(0x0), UINT64_C(0x10), + DEFAULT_EMPTY_KEY + }; + bool expectedResults[6] = { + true, true, true, false, false, + (i == 0)? true : false + }; + testReservedKeys(testKeys, expectedResults, 4); + } +} + +TEST(ReservedKeysTest, testKeyOptions) { + testKeyOptions(); + testCustomMemsetAsync(); +} \ No newline at end of file