Using GPU rmm pool for CCSD(T)
abagusetty committed Jun 17, 2024
1 parent 0b2d076 commit 88d1ada
Showing 5 changed files with 79 additions and 106 deletions.
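
Summary of the change: the (T) code drops its bespoke memory module (memory.cpp and the getGpuMem/getHostMem/getPinnedMem/free* API declared in ccsd_t_common.hpp) in favor of TAMM's RMM-backed device memory pool (RMMMemoryManager::getInstance().getDeviceMemoryPool()), TAMM's pinned-host helpers (tamm::getPinnedMem/tamm::freePinnedMem), and nothrow operator new[]/operator delete[] for plain host scratch buffers. Deleted lines below are marked with a leading "-", added lines with "+".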
1 change: 0 additions & 1 deletion exachem/cc/ccsd_t/ccsd_t.cmake
@@ -4,7 +4,6 @@ include(TargetMacros)
set(CCSD_T_SRCDIR ${CMAKE_CURRENT_SOURCE_DIR}/../exachem/cc/ccsd_t)
set(CCSD_T_COMMON_SRCS
${CCSD_T_SRCDIR}/ccsd_t.cpp
-${CCSD_T_SRCDIR}/memory.cpp
${CCSD_T_SRCDIR}/ccsd_t_common.hpp
${CCSD_T_SRCDIR}/hybrid.cpp
${CCSD_T_SRCDIR}/ccsd_t_fused_driver.hpp
2 changes: 1 addition & 1 deletion exachem/cc/ccsd_t/ccsd_t.cpp
@@ -463,7 +463,7 @@ void exachem::cc::ccsd_t::ccsd_t_driver(ExecutionContext& ec, ChemEnv& chem_env)
// Given that the singleton pool created by TAMM is not used by the (T) kernel calculation,
// we artificially destroy the pool
tamm::reset_rmm_pool();
-// tamm::reinitialize_rmm_pool();
+tamm::reinitialize_rmm_pool();

std::string dev_str = "[CPU]";
#if defined(USE_CUDA)
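
For context, the pool lifecycle around the (T) stage now follows a reset-then-reinitialize bracket. A minimal sketch, assuming only the two tamm:: calls visible in this diff (the surrounding function is illustrative):

```cpp
// Sketch: rebuild TAMM's singleton RMM pool ahead of the (T) kernels, so the
// buffers they request come from a freshly initialized pool instead of raw
// device allocations. Only the two tamm:: calls are taken from this diff.
void prepare_pool_for_triples() {
  tamm::reset_rmm_pool();        // tear down the existing singleton pool
  tamm::reinitialize_rmm_pool(); // recreate it; (T) allocations now draw from it
}
```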
25 changes: 13 additions & 12 deletions exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
@@ -188,12 +188,13 @@ void ccsd_t_fully_fused_none_df_none_task(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
if(!gpuEventQuery(*done_compute)) { gpuEventSynchronize(*done_compute); }

-T* dev_evl_sorted_h1b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h1b));
-T* dev_evl_sorted_h2b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h2b));
-T* dev_evl_sorted_h3b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h3b));
-T* dev_evl_sorted_p4b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p4b));
-T* dev_evl_sorted_p5b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p5b));
-T* dev_evl_sorted_p6b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p6b));
+auto& memDevPool = RMMMemoryManager::getInstance().getDeviceMemoryPool();
+T* dev_evl_sorted_h1b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_h1b));
+T* dev_evl_sorted_h2b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_h2b));
+T* dev_evl_sorted_h3b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_h3b));
+T* dev_evl_sorted_p4b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_p4b));
+T* dev_evl_sorted_p5b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_p5b));
+T* dev_evl_sorted_p6b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_p6b));

gpuMemcpyAsync<T>(dev_evl_sorted_h1b, host_evl_sorted_h1b, base_size_h1b, gpuMemcpyHostToDevice,
stream);
@@ -289,11 +290,11 @@ void ccsd_t_fully_fused_none_df_none_task(
[&](sycl::handler& cgh) { cgh.host_task([=]() { hostEnergyReduce(reduceData); }); });
#endif

-freeGpuMem(dev_evl_sorted_h1b);
-freeGpuMem(dev_evl_sorted_h2b);
-freeGpuMem(dev_evl_sorted_h3b);
-freeGpuMem(dev_evl_sorted_p4b);
-freeGpuMem(dev_evl_sorted_p5b);
-freeGpuMem(dev_evl_sorted_p6b);
+memDevPool.deallocate(dev_evl_sorted_h1b, sizeof(T) * base_size_h1b);
+memDevPool.deallocate(dev_evl_sorted_h2b, sizeof(T) * base_size_h2b);
+memDevPool.deallocate(dev_evl_sorted_h3b, sizeof(T) * base_size_h3b);
+memDevPool.deallocate(dev_evl_sorted_p4b, sizeof(T) * base_size_p4b);
+memDevPool.deallocate(dev_evl_sorted_p5b, sizeof(T) * base_size_p5b);
+memDevPool.deallocate(dev_evl_sorted_p6b, sizeof(T) * base_size_p6b);
#endif
}
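
The pool interface used above follows RMM's memory-resource convention: allocate(bytes) hands back a raw pointer, and deallocate(ptr, bytes) must be passed the same byte count. A minimal sketch of the per-task pattern, assuming the RMMMemoryManager singleton named in this diff (the element count and usage are illustrative):

```cpp
#include <cstddef>

// Sketch: borrow device scratch from the shared RMM pool and hand it back.
template <typename T>
void with_device_scratch(std::size_t n_elems) {
  auto& pool = RMMMemoryManager::getInstance().getDeviceMemoryPool();

  const std::size_t bytes = n_elems * sizeof(T);
  T* dev_buf = static_cast<T*>(pool.allocate(bytes));

  // ... gpuMemcpyAsync into dev_buf, launch kernels, synchronize ...

  // Unlike the old freeGpuMem(ptr), the pool needs the original size back.
  pool.deallocate(dev_buf, bytes);
}
```

The second argument to deallocate is why every deallocation site in this diff repeats the sizeof(T) * size expression from the matching allocate.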
31 changes: 1 addition & 30 deletions exachem/cc/ccsd_t/ccsd_t_common.hpp
@@ -12,6 +12,7 @@
#include <cassert>
#include <cstdio>
#include <memory>
+#include <new>
#include <string>

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
@@ -22,15 +23,6 @@ using event_ptr_t = std::shared_ptr<tamm::gpuEvent_t>;
#endif

#ifdef USE_CUDA
-#define CHECK_ERR(x) \
-  { \
-    cudaError_t err = cudaGetLastError(); \
-    if(cudaSuccess != err) { \
-      printf("%s\n", cudaGetErrorString(err)); \
-      exit(1); \
-    } \
-  }

#define CUDA_SAFE(x) \
if(cudaSuccess != (x)) { \
printf("CUDA API FAILED AT LINE %d OF FILE %s errorcode: %s, %s\n", __LINE__, __FILE__, \
@@ -40,15 +32,6 @@ using event_ptr_t = std::shared_ptr<tamm::gpuEvent_t>;
#endif // USE_CUDA

#ifdef USE_HIP
-#define CHECK_ERR(x) \
-  { \
-    hipError_t err = hipGetLastError(); \
-    if(hipSuccess != err) { \
-      printf("%s\n", hipGetErrorString(err)); \
-      exit(1); \
-    } \
-  }

#define HIP_SAFE(x) \
if(hipSuccess != (x)) { \
printf("HIP API FAILED AT LINE %d OF FILE %s errorcode: %s, %s\n", __LINE__, __FILE__, \
@@ -63,20 +46,8 @@ typedef long Integer;
#define DIV_UB(x, y) ((x) / (y) + ((x) % (y) ? 1 : 0))
#define TG_MIN(x, y) ((x) < (y) ? (x) : (y))

-void initMemModule();
std::string check_memory_req(const int cc_t_ts, const int nbf);

-void* getGpuMem(size_t bytes);
-void* getPinnedMem(size_t bytes);
-void* getHostMem(size_t bytes);
-void* getPinnedMem(size_t bytes);
-void freeHostMem(void* p);
-void freePinnedMem(void* p);
-void freeGpuMem(void* p);
-void freePinnedMem(void* p);

-void finalizeMemModule();

struct hostEnergyReduceData_t {
double* result_energy;
double* host_energies;
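
Of the two error-check macros, only the *_SAFE wrappers survive: the deleted CHECK_ERR polled cudaGetLastError()/hipGetLastError() after the fact, while CUDA_SAFE/HIP_SAFE check a runtime call's return code in place. A usage sketch on the CUDA path (the stream itself is illustrative):

```cpp
// Sketch: CUDA_SAFE (retained above) aborts with file/line context whenever
// the wrapped runtime call returns anything other than cudaSuccess.
cudaStream_t stream = nullptr;
CUDA_SAFE(cudaStreamCreate(&stream));
CUDA_SAFE(cudaStreamSynchronize(stream));
CUDA_SAFE(cudaStreamDestroy(stream));
```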
126 changes: 64 additions & 62 deletions exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp
@@ -17,10 +17,8 @@
#include "ccsd_t_common.hpp"

namespace exachem::cc::ccsd_t {

void ccsd_t_driver(ExecutionContext& ec, ChemEnv& chem_env);
}
-void finalizememmodule();

/**
* to check if target NVIDIA GPUs can support the fully-fused kernel
@@ -176,45 +174,49 @@ std::tuple<T, T, double, double> ccsd_t_fused_driver_new(
T* df_host_pinned_d2_t2{nullptr};
T* df_host_pinned_d2_v2{nullptr};

-int* df_simple_s1_size = static_cast<int*>(getHostMem(sizeof(int) * (6)));
-int* df_simple_s1_exec = static_cast<int*>(getHostMem(sizeof(int) * (9)));
-int* df_simple_d1_size = static_cast<int*>(getHostMem(sizeof(int) * (7 * noab)));
-int* df_simple_d1_exec = static_cast<int*>(getHostMem(sizeof(int) * (9 * noab)));
-int* df_simple_d2_size = static_cast<int*>(getHostMem(sizeof(int) * (7 * nvab)));
-int* df_simple_d2_exec = static_cast<int*>(getHostMem(sizeof(int) * (9 * nvab)));
+int* df_simple_s1_size = static_cast<int*>(operator new[](6 * sizeof(int), std::nothrow));
+int* df_simple_s1_exec = static_cast<int*>(operator new[](9 * sizeof(int), std::nothrow));
+int* df_simple_d1_size = static_cast<int*>(operator new[](7 * noab * sizeof(int), std::nothrow));
+int* df_simple_d1_exec = static_cast<int*>(operator new[](9 * noab * sizeof(int), std::nothrow));
+int* df_simple_d2_size = static_cast<int*>(operator new[](7 * nvab * sizeof(int), std::nothrow));
+int* df_simple_d2_exec = static_cast<int*>(operator new[](9 * nvab * sizeof(int), std::nothrow));

-int* host_d1_size = static_cast<int*>(getHostMem(sizeof(int) * (noab)));
-int* host_d2_size = static_cast<int*>(getHostMem(sizeof(int) * (nvab)));
+int* host_d1_size = static_cast<int*>(operator new[](noab * sizeof(int), std::nothrow));
+int* host_d2_size = static_cast<int*>(operator new[](nvab * sizeof(int), std::nothrow));

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
-T* df_dev_s1_t1_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_s1_t1));
-T* df_dev_s1_v2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_s1_v2));
-T* df_dev_d1_t2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d1_t2));
-T* df_dev_d1_v2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d1_v2));
-T* df_dev_d2_t2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d2_t2));
-T* df_dev_d2_v2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d2_v2));

-df_host_pinned_s1_t1 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_s1_t1));
-df_host_pinned_s1_v2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_s1_v2));
-df_host_pinned_d1_t2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d1_t2));
-df_host_pinned_d1_v2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d1_v2));
-df_host_pinned_d2_t2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d2_t2));
-df_host_pinned_d2_v2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d2_v2));
+auto& memDevPool = RMMMemoryManager::getInstance().getDeviceMemoryPool();
+T* df_dev_s1_t1_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_s1_t1));
+T* df_dev_s1_v2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_s1_v2));
+T* df_dev_d1_t2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d1_t2));
+T* df_dev_d1_v2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d1_v2));
+T* df_dev_d2_t2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d2_t2));
+T* df_dev_d2_v2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d2_v2));

+df_host_pinned_s1_t1 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_s1_t1));
+df_host_pinned_s1_v2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_s1_v2));
+df_host_pinned_d1_t2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d1_t2));
+df_host_pinned_d1_v2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d1_v2));
+df_host_pinned_d2_t2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d2_t2));
+df_host_pinned_d2_v2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d2_v2));

#else // cpu
-df_host_pinned_s1_t1 = static_cast<T*>(getHostMem(sizeof(T) * size_T_s1_t1));
-df_host_pinned_s1_v2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_s1_v2));
-df_host_pinned_d1_t2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d1_t2));
-df_host_pinned_d1_v2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d1_v2));
-df_host_pinned_d2_t2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d2_t2));
-df_host_pinned_d2_v2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d2_v2));
+df_host_pinned_s1_t1 = static_cast<T*>(operator new[](size_T_s1_t1 * sizeof(T), std::nothrow));
+df_host_pinned_s1_v2 = static_cast<T*>(operator new[](size_T_s1_v2 * sizeof(T), std::nothrow));
+df_host_pinned_d1_t2 = static_cast<T*>(operator new[](size_T_d1_t2 * sizeof(T), std::nothrow));
+df_host_pinned_d1_v2 = static_cast<T*>(operator new[](size_T_d1_v2 * sizeof(T), std::nothrow));
+df_host_pinned_d2_t2 = static_cast<T*>(operator new[](size_T_d2_t2 * sizeof(T), std::nothrow));
+df_host_pinned_d2_v2 = static_cast<T*>(operator new[](size_T_d2_v2 * sizeof(T), std::nothrow));
#endif

size_t max_num_blocks = chem_env.ioptions.ccsd_options.ccsdt_tilesize;
max_num_blocks = std::ceil((max_num_blocks + 4 - 1) / 4.0);

-T* df_host_energies = static_cast<T*>(getHostMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2));
+T* df_host_energies =
+  static_cast<T*>(operator new[](std::pow(max_num_blocks, 6) * 2 * sizeof(T), std::nothrow));
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
-T* df_dev_energies = static_cast<T*>(getGpuMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2));
+T* df_dev_energies =
+  static_cast<T*>(memDevPool.allocate(sizeof(T) * std::pow(max_num_blocks, 6) * 2));
#endif

#ifdef USE_DPCPP
@@ -456,43 +458,43 @@ std::tuple<T, T, double, double> ccsd_t_fused_driver_new(
energy1 = energy_l[0];
energy2 = energy_l[1];

-freeHostMem(df_simple_s1_exec);
-freeHostMem(df_simple_s1_size);
-freeHostMem(df_simple_d1_exec);
-freeHostMem(df_simple_d1_size);
-freeHostMem(host_d1_size);
-freeHostMem(df_simple_d2_exec);
-freeHostMem(df_simple_d2_size);
-freeHostMem(host_d2_size);
-freeHostMem(df_host_energies);
+operator delete[](df_simple_s1_exec);
+operator delete[](df_simple_s1_size);
+operator delete[](df_simple_d1_size);
+operator delete[](df_simple_d1_exec);
+operator delete[](df_simple_d2_size);
+operator delete[](df_simple_d2_exec);

+operator delete[](host_d1_size);
+operator delete[](host_d2_size);

+operator delete[](df_host_energies);

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
-freeGpuMem(df_dev_s1_t1_all);
-freeGpuMem(df_dev_s1_v2_all);
-freeGpuMem(df_dev_d1_t2_all);
-freeGpuMem(df_dev_d1_v2_all);
-freeGpuMem(df_dev_d2_t2_all);
-freeGpuMem(df_dev_d2_v2_all);
-freeGpuMem(df_dev_energies);

-freePinnedMem(df_host_pinned_s1_t1);
-freePinnedMem(df_host_pinned_s1_v2);
-freePinnedMem(df_host_pinned_d1_t2);
-freePinnedMem(df_host_pinned_d1_v2);
-freePinnedMem(df_host_pinned_d2_t2);
-freePinnedMem(df_host_pinned_d2_v2);
+memDevPool.deallocate(df_dev_s1_t1_all, sizeof(T) * size_T_s1_t1);
+memDevPool.deallocate(df_dev_s1_v2_all, sizeof(T) * size_T_s1_v2);
+memDevPool.deallocate(df_dev_d1_t2_all, sizeof(T) * size_T_d1_t2);
+memDevPool.deallocate(df_dev_d1_v2_all, sizeof(T) * size_T_d1_v2);
+memDevPool.deallocate(df_dev_d2_t2_all, sizeof(T) * size_T_d2_t2);
+memDevPool.deallocate(df_dev_d2_v2_all, sizeof(T) * size_T_d2_v2);
+memDevPool.deallocate(df_dev_energies, sizeof(T) * std::pow(max_num_blocks, 6) * 2);

+tamm::freePinnedMem(df_host_pinned_s1_t1);
+tamm::freePinnedMem(df_host_pinned_s1_v2);
+tamm::freePinnedMem(df_host_pinned_d1_t2);
+tamm::freePinnedMem(df_host_pinned_d1_v2);
+tamm::freePinnedMem(df_host_pinned_d2_t2);
+tamm::freePinnedMem(df_host_pinned_d2_v2);

#else // cpu
-freeHostMem(df_host_pinned_s1_t1);
-freeHostMem(df_host_pinned_s1_v2);
-freeHostMem(df_host_pinned_d1_t2);
-freeHostMem(df_host_pinned_d1_v2);
-freeHostMem(df_host_pinned_d2_t2);
-freeHostMem(df_host_pinned_d2_v2);
+operator delete[](df_host_pinned_s1_t1);
+operator delete[](df_host_pinned_s1_v2);
+operator delete[](df_host_pinned_d1_t2);
+operator delete[](df_host_pinned_d1_v2);
+operator delete[](df_host_pinned_d2_t2);
+operator delete[](df_host_pinned_d2_v2);
#endif

-finalizememmodule();

auto cc_t2 = std::chrono::high_resolution_clock::now();
auto ccsd_t_time =
std::chrono::duration_cast<std::chrono::duration<double>>((cc_t2 - cc_t1)).count();
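On the host side, the driver now pairs nothrow operator new[] with operator delete[] where it previously used getHostMem/freeHostMem. A self-contained sketch of that pattern; the element count and the explicit null check are illustrative additions (the diff itself does not check the returned pointer):

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>

int main() {
  const std::size_t noab = 8; // illustrative; the driver uses the real block counts

  // The nothrow form returns nullptr on failure instead of throwing std::bad_alloc.
  int* host_d1_size = static_cast<int*>(operator new[](noab * sizeof(int), std::nothrow));
  if(host_d1_size == nullptr) {
    std::fprintf(stderr, "host scratch allocation failed\n");
    return EXIT_FAILURE;
  }

  for(std::size_t i = 0; i < noab; ++i) host_d1_size[i] = 0;

  // Storage from operator new[] (nothrow or not) is released with operator delete[].
  operator delete[](host_d1_size);
  return EXIT_SUCCESS;
}
```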
