Skip to content

Commit

Permalink
opt(insert-and-evict): thrust prefix_sum introduce cudaMalloc/cudaFre…
Browse files Browse the repository at this point in the history
…e which make host wait. Replace it by cub API.
  • Loading branch information
Lifann committed Dec 12, 2023
1 parent 770be38 commit e728dd2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
14 changes: 5 additions & 9 deletions include/merlin/array_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <cooperative_groups.h>
#include "cub/cub.cuh"
#include "cuda_runtime.h"
#include "thrust/device_vector.h"
#include "thrust/execution_policy.h"
Expand Down Expand Up @@ -104,18 +105,13 @@ template <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>
void gpu_boolean_mask(size_t grid_size, size_t block_size, const bool* masks,
size_t n, size_t* n_evicted, Tidx* offsets,
K* __restrict keys, V* __restrict values,
S* __restrict scores, size_t dim, cudaStream_t stream) {
S* __restrict scores, Tidx* offset_ws,
size_t offset_ws_bytes, size_t dim, cudaStream_t stream) {
size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE;
gpu_cell_count<Tidx, TILE_SIZE>
<<<grid_size, block_size, 0, stream>>>(masks, offsets, n, n_evicted);
#if THRUST_VERSION >= 101600
auto policy = thrust::cuda::par_nosync.on(stream);
#else
auto policy = thrust::cuda::par.on(stream);
#endif
thrust::device_ptr<Tidx> d_src(offsets);
thrust::device_ptr<Tidx> d_dest(offsets);
thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest);
cub::DeviceScan::ExclusiveSum(offset_ws, offset_ws_bytes, offsets, offsets,
n_offsets, stream);
gpu_select_kvm_kernel<K, V, S, Tidx, TILE_SIZE>
<<<grid_size, block_size, 0, stream>>>(masks, n, offsets, keys, values,
scores, dim);
Expand Down
13 changes: 12 additions & 1 deletion include/merlin_hashtable.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <mutex>
#include <shared_mutex>
#include <type_traits>
#include "cub/cub.cuh"
#include "merlin/allocator.cuh"
#include "merlin/array_kernels.cuh"
#include "merlin/core_kernels.cuh"
Expand Down Expand Up @@ -598,9 +599,19 @@ class HashTable {

keys_not_empty<K>
<<<grid_size, block_size, 0, stream>>>(evicted_keys, d_masks, n);

void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
d_offsets, d_offsets, n_offsets, stream);
auto helper_ws{
dev_mem_pool_->get_workspace<1>(temp_storage_bytes, stream)};
int64_t* d_temp_storage_i64 = helper_ws.get<int64_t*>(0);

gpu_boolean_mask<K, V, S, int64_t, TILE_SIZE>(
grid_size, block_size, d_masks, n, d_evicted_counter, d_offsets,
evicted_keys, evicted_values, evicted_scores, dim(), stream);
evicted_keys, evicted_values, evicted_scores, d_temp_storage_i64,
temp_storage_bytes, dim(), stream);
}
return;
}
Expand Down

0 comments on commit e728dd2

Please sign in to comment.