diff --git a/exachem/cc/ccsd_t/ccsd_t.cmake b/exachem/cc/ccsd_t/ccsd_t.cmake
index e0bb35f..d662bd2 100644
--- a/exachem/cc/ccsd_t/ccsd_t.cmake
+++ b/exachem/cc/ccsd_t/ccsd_t.cmake
@@ -4,7 +4,6 @@ include(TargetMacros)
 set(CCSD_T_SRCDIR ${CMAKE_CURRENT_SOURCE_DIR}/../exachem/cc/ccsd_t)
 set(CCSD_T_COMMON_SRCS
     ${CCSD_T_SRCDIR}/ccsd_t.cpp
-    ${CCSD_T_SRCDIR}/memory.cpp
     ${CCSD_T_SRCDIR}/ccsd_t_common.hpp
     ${CCSD_T_SRCDIR}/hybrid.cpp
     ${CCSD_T_SRCDIR}/ccsd_t_fused_driver.hpp
diff --git a/exachem/cc/ccsd_t/ccsd_t.cpp b/exachem/cc/ccsd_t/ccsd_t.cpp
index 4e289f7..2052ea8 100644
--- a/exachem/cc/ccsd_t/ccsd_t.cpp
+++ b/exachem/cc/ccsd_t/ccsd_t.cpp
@@ -463,7 +463,7 @@ void exachem::cc::ccsd_t::ccsd_t_driver(ExecutionContext& ec, ChemEnv& chem_env)
   // Given the singleton pool created by the TAMM is not used by the (T) kernel calculation.
   // We artifically destroy the pool
   tamm::reset_rmm_pool();
-  // tamm::reinitialize_rmm_pool();
+  tamm::reinitialize_rmm_pool();

   std::string dev_str = "[CPU]";
 #if defined(USE_CUDA)
diff --git a/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp b/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
index 5b1090c..c58833a 100644
--- a/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
+++ b/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
@@ -188,12 +188,13 @@ void ccsd_t_fully_fused_none_df_none_task(
 #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
   if(!gpuEventQuery(*done_compute)) { gpuEventSynchronize(*done_compute); }

-  T* dev_evl_sorted_h1b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h1b));
-  T* dev_evl_sorted_h2b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h2b));
-  T* dev_evl_sorted_h3b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h3b));
-  T* dev_evl_sorted_p4b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p4b));
-  T* dev_evl_sorted_p5b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p5b));
-  T* dev_evl_sorted_p6b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p6b));
+  auto& memDevPool = RMMMemoryManager::getInstance().getDeviceMemoryPool();
+  T* dev_evl_sorted_h1b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_h1b));
+  T* dev_evl_sorted_h2b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_h2b));
+  T* dev_evl_sorted_h3b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_h3b));
+  T* dev_evl_sorted_p4b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_p4b));
+  T* dev_evl_sorted_p5b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_p5b));
+  T* dev_evl_sorted_p6b = static_cast<T*>(memDevPool.allocate(sizeof(T) * base_size_p6b));

   gpuMemcpyAsync(dev_evl_sorted_h1b, host_evl_sorted_h1b, base_size_h1b, gpuMemcpyHostToDevice,
                  stream);
@@ -289,11 +290,11 @@
     [&](sycl::handler& cgh) { cgh.host_task([=]() { hostEnergyReduce(reduceData); }); });
 #endif

-  freeGpuMem(dev_evl_sorted_h1b);
-  freeGpuMem(dev_evl_sorted_h2b);
-  freeGpuMem(dev_evl_sorted_h3b);
-  freeGpuMem(dev_evl_sorted_p4b);
-  freeGpuMem(dev_evl_sorted_p5b);
-  freeGpuMem(dev_evl_sorted_p6b);
+  memDevPool.deallocate(dev_evl_sorted_h1b, sizeof(T) * base_size_h1b);
+  memDevPool.deallocate(dev_evl_sorted_h2b, sizeof(T) * base_size_h2b);
+  memDevPool.deallocate(dev_evl_sorted_h3b, sizeof(T) * base_size_h3b);
+  memDevPool.deallocate(dev_evl_sorted_p4b, sizeof(T) * base_size_p4b);
+  memDevPool.deallocate(dev_evl_sorted_p5b, sizeof(T) * base_size_p5b);
+  memDevPool.deallocate(dev_evl_sorted_p6b, sizeof(T) * base_size_p6b);
 #endif
 }
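Review note on the hunk above: unlike the old `freeGpuMem(void*)`, `MemoryPool::deallocate` takes the byte count that was passed to `allocate`, so every allocation size now has to be carried to the matching release. The sketch below is a hypothetical RAII helper (not part of this patch) that keeps the pointer and its size together; only `RMMMemoryManager::getInstance().getDeviceMemoryPool()` and its `allocate`/`deallocate` calls are taken from the diff, the wrapper name and structure are illustrative.

```cpp
#include <cstddef>

// Hypothetical helper, assuming the TAMM RMMMemoryManager/MemoryPool API shown in the diff.
template<typename T>
class PooledDeviceBuffer {
public:
  explicit PooledDeviceBuffer(std::size_t count):
    bytes_(count * sizeof(T)),
    ptr_(static_cast<T*>(RMMMemoryManager::getInstance().getDeviceMemoryPool().allocate(bytes_))) {}

  ~PooledDeviceBuffer() {
    // deallocate() expects the original byte count, which is stored alongside the pointer.
    RMMMemoryManager::getInstance().getDeviceMemoryPool().deallocate(ptr_, bytes_);
  }

  PooledDeviceBuffer(const PooledDeviceBuffer&)            = delete;
  PooledDeviceBuffer& operator=(const PooledDeviceBuffer&) = delete;

  T* get() const { return ptr_; }

private:
  std::size_t bytes_; // size recorded at allocation time
  T*          ptr_;   // device pointer handed out by the pool
};

// Usage mirroring dev_evl_sorted_h1b above (illustrative only):
//   PooledDeviceBuffer<double> dev_evl_sorted_h1b(base_size_h1b);
//   gpuMemcpyAsync(dev_evl_sorted_h1b.get(), host_evl_sorted_h1b, base_size_h1b,
//                  gpuMemcpyHostToDevice, stream);
```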
diff --git a/exachem/cc/ccsd_t/ccsd_t_common.hpp b/exachem/cc/ccsd_t/ccsd_t_common.hpp
index 04def7e..cbf6e29 100644
--- a/exachem/cc/ccsd_t/ccsd_t_common.hpp
+++ b/exachem/cc/ccsd_t/ccsd_t_common.hpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include

 #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
@@ -22,15 +23,6 @@ using event_ptr_t = std::shared_ptr<gpuEvent_t>;
 #endif

 #ifdef USE_CUDA
-#define CHECK_ERR(x) \
-  { \
-    cudaError_t err = cudaGetLastError(); \
-    if(cudaSuccess != err) { \
-      printf("%s\n", cudaGetErrorString(err)); \
-      exit(1); \
-    } \
-  }
-
 #define CUDA_SAFE(x) \
   if(cudaSuccess != (x)) { \
     printf("CUDA API FAILED AT LINE %d OF FILE %s errorcode: %s, %s\n", __LINE__, __FILE__, \
@@ -40,15 +32,6 @@ using event_ptr_t = std::shared_ptr<gpuEvent_t>;
 #endif // USE_CUDA

 #ifdef USE_HIP
-#define CHECK_ERR(x) \
-  { \
-    hipError_t err = hipGetLastError(); \
-    if(hipSuccess != err) { \
-      printf("%s\n", hipGetErrorString(err)); \
-      exit(1); \
-    } \
-  }
-
 #define HIP_SAFE(x) \
   if(hipSuccess != (x)) { \
     printf("HIP API FAILED AT LINE %d OF FILE %s errorcode: %s, %s\n", __LINE__, __FILE__, \
@@ -63,20 +46,8 @@ typedef long Integer;
 #define DIV_UB(x, y) ((x) / (y) + ((x) % (y) ? 1 : 0))
 #define TG_MIN(x, y) ((x) < (y) ? (x) : (y))

-void initMemModule();
 std::string check_memory_req(const int cc_t_ts, const int nbf);
-void* getGpuMem(size_t bytes);
-void* getPinnedMem(size_t bytes);
-void* getHostMem(size_t bytes);
-void* getPinnedMem(size_t bytes);
-void freeHostMem(void* p);
-void freePinnedMem(void* p);
-void freeGpuMem(void* p);
-void freePinnedMem(void* p);
-
-void finalizeMemModule();
-
 struct hostEnergyReduceData_t {
   double* result_energy;
   double* host_energies;
diff --git a/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp b/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp
index 7204bde..e2ef8a2 100644
--- a/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp
+++ b/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp
@@ -17,10 +17,8 @@
 #include "ccsd_t_common.hpp"

 namespace exachem::cc::ccsd_t {
-
 void ccsd_t_driver(ExecutionContext& ec, ChemEnv& chem_env);
 }
-void finalizememmodule();

 /**
  * to check if target NVIDIA GPUs can support the fully-fused kernel
@@ -176,45 +174,49 @@ std::tuple ccsd_t_fused_driver_new(
   T* df_host_pinned_d2_t2{nullptr};
   T* df_host_pinned_d2_v2{nullptr};

-  int* df_simple_s1_size = static_cast<int*>(getHostMem(sizeof(int) * (6)));
-  int* df_simple_s1_exec = static_cast<int*>(getHostMem(sizeof(int) * (9)));
-  int* df_simple_d1_size = static_cast<int*>(getHostMem(sizeof(int) * (7 * noab)));
-  int* df_simple_d1_exec = static_cast<int*>(getHostMem(sizeof(int) * (9 * noab)));
-  int* df_simple_d2_size = static_cast<int*>(getHostMem(sizeof(int) * (7 * nvab)));
-  int* df_simple_d2_exec = static_cast<int*>(getHostMem(sizeof(int) * (9 * nvab)));
+  int* df_simple_s1_size = static_cast<int*>(operator new[](6 * sizeof(int), std::nothrow));
+  int* df_simple_s1_exec = static_cast<int*>(operator new[](9 * sizeof(int), std::nothrow));
+  int* df_simple_d1_size = static_cast<int*>(operator new[](7 * noab * sizeof(int), std::nothrow));
+  int* df_simple_d1_exec = static_cast<int*>(operator new[](9 * noab * sizeof(int), std::nothrow));
+  int* df_simple_d2_size = static_cast<int*>(operator new[](7 * nvab * sizeof(int), std::nothrow));
+  int* df_simple_d2_exec = static_cast<int*>(operator new[](9 * nvab * sizeof(int), std::nothrow));

-  int* host_d1_size = static_cast<int*>(getHostMem(sizeof(int) * (noab)));
-  int* host_d2_size = static_cast<int*>(getHostMem(sizeof(int) * (nvab)));
+  int* host_d1_size = static_cast<int*>(operator new[](noab * sizeof(int), std::nothrow));
+  int* host_d2_size = static_cast<int*>(operator new[](nvab * sizeof(int), std::nothrow));

 #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
-  T* df_dev_s1_t1_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_s1_t1));
-  T* df_dev_s1_v2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_s1_v2));
-  T* df_dev_d1_t2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d1_t2));
-  T* df_dev_d1_v2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d1_v2));
-  T* df_dev_d2_t2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d2_t2));
-  T* df_dev_d2_v2_all = static_cast<T*>(getGpuMem(sizeof(T) * size_T_d2_v2));
-
-  df_host_pinned_s1_t1 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_s1_t1));
-  df_host_pinned_s1_v2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_s1_v2));
-  df_host_pinned_d1_t2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d1_t2));
-  df_host_pinned_d1_v2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d1_v2));
-  df_host_pinned_d2_t2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d2_t2));
-  df_host_pinned_d2_v2 = static_cast<T*>(getPinnedMem(sizeof(T) * size_T_d2_v2));
+  auto& memDevPool = RMMMemoryManager::getInstance().getDeviceMemoryPool();
+  T* df_dev_s1_t1_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_s1_t1));
+  T* df_dev_s1_v2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_s1_v2));
+  T* df_dev_d1_t2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d1_t2));
+  T* df_dev_d1_v2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d1_v2));
+  T* df_dev_d2_t2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d2_t2));
+  T* df_dev_d2_v2_all = static_cast<T*>(memDevPool.allocate(sizeof(T) * size_T_d2_v2));
+
+  df_host_pinned_s1_t1 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_s1_t1));
+  df_host_pinned_s1_v2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_s1_v2));
+  df_host_pinned_d1_t2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d1_t2));
+  df_host_pinned_d1_v2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d1_v2));
+  df_host_pinned_d2_t2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d2_t2));
+  df_host_pinned_d2_v2 = static_cast<T*>(tamm::getPinnedMem(sizeof(T) * size_T_d2_v2));
+
 #else // cpu
-  df_host_pinned_s1_t1 = static_cast<T*>(getHostMem(sizeof(T) * size_T_s1_t1));
-  df_host_pinned_s1_v2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_s1_v2));
-  df_host_pinned_d1_t2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d1_t2));
-  df_host_pinned_d1_v2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d1_v2));
-  df_host_pinned_d2_t2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d2_t2));
-  df_host_pinned_d2_v2 = static_cast<T*>(getHostMem(sizeof(T) * size_T_d2_v2));
+  df_host_pinned_s1_t1 = static_cast<T*>(operator new[](size_T_s1_t1 * sizeof(T), std::nothrow));
+  df_host_pinned_s1_v2 = static_cast<T*>(operator new[](size_T_s1_v2 * sizeof(T), std::nothrow));
+  df_host_pinned_d1_t2 = static_cast<T*>(operator new[](size_T_d1_t2 * sizeof(T), std::nothrow));
+  df_host_pinned_d1_v2 = static_cast<T*>(operator new[](size_T_d1_v2 * sizeof(T), std::nothrow));
+  df_host_pinned_d2_t2 = static_cast<T*>(operator new[](size_T_d2_t2 * sizeof(T), std::nothrow));
+  df_host_pinned_d2_v2 = static_cast<T*>(operator new[](size_T_d2_v2 * sizeof(T), std::nothrow));
 #endif

   size_t max_num_blocks = chem_env.ioptions.ccsd_options.ccsdt_tilesize;
   max_num_blocks = std::ceil((max_num_blocks + 4 - 1) / 4.0);

-  T* df_host_energies = static_cast<T*>(getHostMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2));
+  T* df_host_energies =
+    static_cast<T*>(operator new[](std::pow(max_num_blocks, 6) * 2 * sizeof(T), std::nothrow));

 #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
-  T* df_dev_energies = static_cast<T*>(getGpuMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2));
+  T* df_dev_energies =
+    static_cast<T*>(memDevPool.allocate(sizeof(T) * std::pow(max_num_blocks, 6) * 2));
 #endif

 #ifdef USE_DPCPP
@@ -456,43 +458,43 @@
   energy1 = energy_l[0];
   energy2 = energy_l[1];

-  freeHostMem(df_simple_s1_exec);
-  freeHostMem(df_simple_s1_size);
-  freeHostMem(df_simple_d1_exec);
-  freeHostMem(df_simple_d1_size);
-  freeHostMem(host_d1_size);
-  freeHostMem(df_simple_d2_exec);
-  freeHostMem(df_simple_d2_size);
-  freeHostMem(host_d2_size);
-  freeHostMem(df_host_energies);
+  operator delete[](df_simple_s1_exec);
+  operator delete[](df_simple_s1_size);
+  operator delete[](df_simple_d1_size);
+  operator delete[](df_simple_d1_exec);
+  operator delete[](df_simple_d2_size);
+  operator delete[](df_simple_d2_exec);
+
+  operator delete[](host_d1_size);
+  operator delete[](host_d2_size);
+
+  operator delete[](df_host_energies);

 #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
-  freeGpuMem(df_dev_s1_t1_all);
-  freeGpuMem(df_dev_s1_v2_all);
-  freeGpuMem(df_dev_d1_t2_all);
-  freeGpuMem(df_dev_d1_v2_all);
-  freeGpuMem(df_dev_d2_t2_all);
-  freeGpuMem(df_dev_d2_v2_all);
-  freeGpuMem(df_dev_energies);
-
-  freePinnedMem(df_host_pinned_s1_t1);
-  freePinnedMem(df_host_pinned_s1_v2);
-  freePinnedMem(df_host_pinned_d1_t2);
-  freePinnedMem(df_host_pinned_d1_v2);
-  freePinnedMem(df_host_pinned_d2_t2);
-  freePinnedMem(df_host_pinned_d2_v2);
+  memDevPool.deallocate(df_dev_s1_t1_all, sizeof(T) * size_T_s1_t1);
+  memDevPool.deallocate(df_dev_s1_v2_all, sizeof(T) * size_T_s1_v2);
+  memDevPool.deallocate(df_dev_d1_t2_all, sizeof(T) * size_T_d1_t2);
+  memDevPool.deallocate(df_dev_d1_v2_all, sizeof(T) * size_T_d1_v2);
+  memDevPool.deallocate(df_dev_d2_t2_all, sizeof(T) * size_T_d2_t2);
+  memDevPool.deallocate(df_dev_d2_v2_all, sizeof(T) * size_T_d2_v2);
+  memDevPool.deallocate(df_dev_energies, sizeof(T) * std::pow(max_num_blocks, 6) * 2);
+
+  tamm::freePinnedMem(df_host_pinned_s1_t1);
+  tamm::freePinnedMem(df_host_pinned_s1_v2);
+  tamm::freePinnedMem(df_host_pinned_d1_t2);
+  tamm::freePinnedMem(df_host_pinned_d1_v2);
+  tamm::freePinnedMem(df_host_pinned_d2_t2);
+  tamm::freePinnedMem(df_host_pinned_d2_v2);
 #else // cpu
-freeHostMem(df_host_pinned_s1_t1);
-freeHostMem(df_host_pinned_s1_v2);
-freeHostMem(df_host_pinned_d1_t2);
-freeHostMem(df_host_pinned_d1_v2);
-freeHostMem(df_host_pinned_d2_t2);
-freeHostMem(df_host_pinned_d2_v2);
+operator delete[](df_host_pinned_s1_t1);
+operator delete[](df_host_pinned_s1_v2);
+operator delete[](df_host_pinned_d1_t2);
+operator delete[](df_host_pinned_d1_v2);
+operator delete[](df_host_pinned_d2_t2);
+operator delete[](df_host_pinned_d2_v2);
 #endif

-  finalizememmodule();
-
   auto cc_t2 = std::chrono::high_resolution_clock::now();
   auto ccsd_t_time =
     std::chrono::duration_cast<std::chrono::duration<double>>((cc_t2 - cc_t1)).count();
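A small, self-contained illustration of the host-side pattern the CPU branches above now rely on: `operator new[](n, std::nothrow)` returns `nullptr` on failure instead of throwing `std::bad_alloc`, so the caller checks the pointer, and the storage is released with a plain `operator delete[]`. The buffer name and size below are made up for the example and are not taken from the driver.

```cpp
#include <cstddef>
#include <cstdio>
#include <new> // std::nothrow

int main() {
  const std::size_t n = 7 * 16; // stand-in for a size such as 7 * noab; arbitrary here

  // nothrow allocation: check the returned pointer instead of catching std::bad_alloc
  int* buf = static_cast<int*>(operator new[](n * sizeof(int), std::nothrow));
  if(buf == nullptr) {
    std::fprintf(stderr, "host allocation of %zu bytes failed\n", n * sizeof(int));
    return 1;
  }

  for(std::size_t i = 0; i < n; ++i) buf[i] = 0; // use the buffer

  operator delete[](buf); // matches the nothrow operator new[] above
  return 0;
}
```

Because this allocates raw storage and never runs constructors or destructors, it is only suitable for trivial element types such as the `int` and `double` buffers used in the driver.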