diff --git a/src/acc/fasten.hpp b/src/acc/fasten.hpp index c40a38f..35f3494 100644 --- a/src/acc/fasten.hpp +++ b/src/acc/fasten.hpp @@ -146,7 +146,7 @@ template class IMPL_CLS final : public Bude { std::vector devices; acc_device_t device_type = acc_get_device_type(); for (int i = 0; i < acc_get_num_devices(device_type); ++i) { - devices.template emplace_back(i, "OMP target device #" + std::to_string(i)); + devices.template emplace_back(i, "ACC target device #" + std::to_string(i)); } return devices; }; @@ -158,78 +158,77 @@ template class IMPL_CLS final : public Bude { Sample sample(PPWI, wgsize, p.nposes()); - auto contextStart = now(); - - std::array poses{}; - auto protein = static_cast(std::malloc(sizeof(Atom) * p.natpro())); - auto ligand = static_cast(std::malloc(sizeof(Atom) * p.natlig())); - auto forcefield = static_cast(std::malloc(sizeof(FFParams) * p.ntypes())); - auto energies = static_cast(std::malloc(sizeof(float) * p.nposes())); - - for (size_t i = 0; i < 6; i++) { - poses[i] = static_cast(std::malloc(sizeof(float) * p.nposes())); - std::copy(p.poses[i].begin(), p.poses[i].end(), poses[i]); - } - - std::copy(p.protein.begin(), p.protein.end(), protein); - std::copy(p.ligand.begin(), p.ligand.end(), ligand); - std::copy(p.forcefield.begin(), p.forcefield.end(), forcefield); - std::fill(energies, energies + p.nposes(), 0.0); - - auto contextEnd = now(); - sample.contextTime = {contextStart, contextEnd}; - - auto poses_0 = poses[0]; - auto poses_1 = poses[1]; - auto poses_2 = poses[2]; - auto poses_3 = poses[3]; - auto poses_4 = poses[4]; - auto poses_5 = poses[5]; const auto ntypes = p.ntypes(); const auto nposes = p.nposes(); const auto natlig = p.natlig(); const auto natpro = p.natpro(); -// clang-format off -#pragma acc data \ - copyin( \ - protein[:natpro], ligand[:natlig], \ - forcefield[:ntypes], poses_0[:nposes], \ - poses_1[:nposes], poses_2[:nposes], \ - poses_3[:nposes], poses_4[:nposes], \ - poses_5[:nposes]) \ - copyout(energies[:nposes]) -{ + auto poses = p.poses.data(); + auto protein = p.protein.data(); + auto ligand = p.ligand.data(); + auto forcefield = p.forcefield.data(); + auto energies = sample.energies.data(); + + auto poses_0 = poses[0].data(); + auto poses_1 = poses[1].data(); + auto poses_2 = poses[2].data(); + auto poses_3 = poses[3].data(); + auto poses_4 = poses[4].data(); + auto poses_5 = poses[5].data(); + + // clang-format off + auto hostToDeviceStart = now(); + + #pragma acc enter data \ + copyin( \ + protein[:natpro], ligand[:natlig], \ + forcefield[:ntypes], poses_0[:nposes], \ + poses_1[:nposes], poses_2[:nposes], \ + poses_3[:nposes], poses_4[:nposes], \ + poses_5[:nposes]) \ + + auto hostToDeviceEnd = now(); + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + + #pragma acc enter data \ + create(energies[:nposes]) + + // clang-format on + for (size_t i = 0; i < p.totalIterations(); ++i) { + auto kernelStart = now(); + + // clang-format off + #pragma acc parallel loop \ + present( \ + protein[:natpro], ligand[:natlig], \ + forcefield[:ntypes], poses_0[:nposes], \ + poses_1[:nposes], poses_2[:nposes], \ + poses_3[:nposes], poses_4[:nposes], \ + poses_5[:nposes], energies[:nposes]) // clang-format on - for (size_t i = 0; i < p.totalIterations(); ++i) { - auto kernelStart = now(); - -// clang-format off -#pragma acc parallel loop \ - present( \ - protein[:natpro], ligand[:natlig], \ - forcefield[:ntypes], poses_0[:nposes], \ - poses_1[:nposes], poses_2[:nposes], \ - poses_3[:nposes], poses_4[:nposes], \ - poses_5[:nposes], energies[:nposes]) wait - // clang-format on - for (size_t group = 0; group < (nposes / PPWI); group++) { - fasten_main(group, ntypes, nposes, natlig, natpro, // - protein, ligand, // - poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, // - forcefield, energies); - } - auto kernelEnd = now(); - sample.kernelTimes.emplace_back(kernelStart, kernelEnd); + for (size_t group = 0; group < (nposes / PPWI); group++) { + fasten_main(group, ntypes, nposes, natlig, natpro, // + protein, ligand, // + poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, // + forcefield, energies); } + auto kernelEnd = now(); + sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } - std::copy(energies, energies + p.nposes(), sample.energies.begin()); - std::free(protein); - std::free(ligand); - std::free(forcefield); - std::free(energies); - for (auto &pose : poses) - std::free(pose); + + auto deviceToHostStart = now(); + + #pragma acc update \ + host(energies[:nposes]) + + auto deviceToHostEnd = now(); + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; + + #pragma acc exit data \ + delete(protein, ligand, forcefield, \ + poses_0, poses_1, poses_2, \ + poses_3, poses_4, poses_5, \ + energies) return sample; }; diff --git a/src/acc/model.cmake b/src/acc/model.cmake index a85ea03..c40b36c 100644 --- a/src/acc/model.cmake +++ b/src/acc/model.cmake @@ -43,6 +43,11 @@ register_flag_optional(TARGET_PROCESSOR Refer to `nvc++ --help` for the full list" "") +register_flag_optional(OFFLOAD_FLAGS + "OpenACC Offload Flags" + "" +) + set(ACC_FLAGS_OFFLOAD_GNU -foffload=-lm) macro(setup) @@ -89,5 +94,9 @@ macro(setup) endif () + separate_arguments(OFFLOAD_FLAGS) + register_append_cxx_flags(ANY ${OFFLOAD_FLAGS}) + register_append_link_flags(${OFFLOAD_FLAGS}) + endmacro() diff --git a/src/bude.h b/src/bude.h index c2b0d91..190c656 100644 --- a/src/bude.h +++ b/src/bude.h @@ -10,6 +10,7 @@ #define DIFF_TOLERANCE_PCT 0.025f #define DEFAULT_ITERS 8 +#define DEFAULT_WARMUPS 2 #define DEFAULT_ENERGY_ENTRIES 8 #define DATA_DIR "../data/bm1" @@ -61,6 +62,7 @@ struct Params { size_t maxPoses, iterations, warmupIterations, outRows; std::string deckDir, output, deviceSelector; bool csv; + std::string csv_filename; bool list; @@ -85,9 +87,10 @@ struct Sample { size_t ppwi, wgsize; std::vector energies; std::vector> kernelTimes; - std::optional> contextTime; + std::optional> hostToDevice; + std::optional> deviceToHost; Sample(size_t ppwi, size_t wgsize, size_t nposes) - : ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), contextTime() {} + : ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), hostToDevice(), deviceToHost() {} }; using Device = std::pair; diff --git a/src/cuda/fasten.hpp b/src/cuda/fasten.hpp index 39927a4..8707b53 100644 --- a/src/cuda/fasten.hpp +++ b/src/cuda/fasten.hpp @@ -253,7 +253,7 @@ template class IMPL_CLS final : public Bude { Sample sample(PPWI, wgsize, p.nposes()); - auto contextStart = now(); + auto hostToDeviceStart = now(); auto protein = allocate(p.protein); auto ligand = allocate(p.ligand); auto transforms_0 = allocate(p.poses[0]); @@ -263,12 +263,14 @@ template class IMPL_CLS final : public Bude { auto transforms_4 = allocate(p.poses[4]); auto transforms_5 = allocate(p.poses[5]); auto forcefield = allocate(p.forcefield); - auto results = allocate(sample.energies.size()); checkError(cudaDeviceSynchronize()); - auto contextEnd = now(); + auto hostToDeviceEnd = now(); - sample.contextTime = {contextStart, contextEnd}; + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + auto results = allocate(sample.energies.size()); + checkError(cudaDeviceSynchronize()); + size_t global = std::ceil(double(p.nposes()) / PPWI); global = std::ceil(double(global) / double(wgsize)); size_t local = wgsize; @@ -285,8 +287,13 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } + auto deviceToHostStart = now(); + checkError( cudaMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), cudaMemcpyDeviceToHost)); + + auto deviceToHostEnd = now(); + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; free(protein); free(ligand); diff --git a/src/hip/fasten.hpp b/src/hip/fasten.hpp index a3c5973..a77ceb5 100644 --- a/src/hip/fasten.hpp +++ b/src/hip/fasten.hpp @@ -215,7 +215,7 @@ template class IMPL_CLS final : public Bude { Sample sample(PPWI, wgsize, p.nposes()); - auto contextStart = now(); + auto hostToDeviceStart = now(); auto protein = allocate(p.protein); auto ligand = allocate(p.ligand); auto transforms_0 = allocate(p.poses[0]); @@ -225,12 +225,14 @@ template class IMPL_CLS final : public Bude { auto transforms_4 = allocate(p.poses[4]); auto transforms_5 = allocate(p.poses[5]); auto forcefield = allocate(p.forcefield); - auto results = allocate(sample.energies.size()); checkError(hipDeviceSynchronize()); - auto contextEnd = now(); + auto hostToDeviceEnd = now(); - sample.contextTime = {contextStart, contextEnd}; + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + auto results = allocate(sample.energies.size()); + checkError(hipDeviceSynchronize()); + size_t global = std::ceil(double(p.nposes()) / PPWI); global = std::ceil(double(global) / double(wgsize)); size_t local = wgsize; @@ -247,8 +249,12 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } + auto deviceToHostStart = now(); checkError( hipMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), hipMemcpyDeviceToHost)); + auto deviceToHostEnd = now(); + + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; free(protein); free(ligand); diff --git a/src/kokkos/fasten.hpp b/src/kokkos/fasten.hpp index 1bd4ffc..3772f34 100644 --- a/src/kokkos/fasten.hpp +++ b/src/kokkos/fasten.hpp @@ -172,10 +172,9 @@ template class IMPL_CLS final : public Bude { } template static Kokkos::View mkView(const std::string &name, const std::vector &xs) { - Kokkos::View view(name, xs.size()); - auto mirror = Kokkos::create_mirror_view(view); - for (size_t i = 0; i < xs.size(); i++) - mirror[i] = xs[i]; + Kokkos::View> mirror (std::data(xs), std::size(xs)); + Kokkos::View view (name, std::size(xs)); Kokkos::deep_copy(view, mirror); return view; } @@ -195,10 +194,10 @@ template class IMPL_CLS final : public Bude { if (!Kokkos::is_initialized()) { Kokkos::initialize(); } - + Sample sample(PPWI, wgsize, p.nposes()); - auto contextStart = now(); + auto hostToDeviceStart = now(); auto protein = mkView("protein", p.protein); auto ligand = mkView("ligand", p.ligand); @@ -209,12 +208,15 @@ template class IMPL_CLS final : public Bude { auto transforms_4 = mkView("transforms_4", p.poses[4]); auto transforms_5 = mkView("transforms_5", p.poses[5]); auto forcefield = mkView("forcefield", p.forcefield); - Kokkos::View results("results", sample.energies.size()); Kokkos::fence(); - auto contextEnd = now(); - sample.contextTime = {contextStart, contextEnd}; - for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) { + auto hostToDeviceEnd = now(); + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + + Kokkos::View results(Kokkos::ViewAllocateWithoutInitializing("results"), sample.energies.size()); + Kokkos::fence(); + + for (size_t i = 0; i < p.totalIterations(); ++i) { auto kernelStart = now(); fasten_main(wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), // protein, ligand, forcefield, // @@ -224,8 +226,14 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } + auto deviceToHostStart = now(); + auto result_mirror = Kokkos::create_mirror_view(results); Kokkos::deep_copy(result_mirror, results); + + auto deviceToHostEnd = now(); + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; + for (size_t i = 0; i < results.size(); i++) { sample.energies[i] = result_mirror[i]; } diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index e1a71f8..a38a734 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -4,13 +4,15 @@ register_flag_optional(CMAKE_CXX_COMPILER See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") -register_flag_required(KOKKOS_IN_TREE +register_flag_optional(KOKKOS_IN_TREE "Absolute path to the *source* distribution directory of Kokkos. Remember to append Kokkos specific flags as well, for example: - -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... + See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") - See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options") +register_flag_optional(KOKKOS_IN_PACKAGE + "Absolute path to package R-Path containing Kokkos libs. + Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "") # compiler vendor and arch specific flags set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) @@ -19,15 +21,22 @@ macro(setup) cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_EXTENSIONS OFF) + message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") - if (EXISTS "${KOKKOS_IN_TREE}") + if (KOKKOS_IN_TREE) add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) register_link_library(Kokkos::kokkos) + elseif (KOKKOS_IN_PACKAGE) + message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") + find_package(Kokkos REQUIRED) + register_link_library(Kokkos::kokkos) else () - message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist") + message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") endif () - + register_append_compiler_and_arch_specific_cxx_flags( KOKKOS_FLAGS_CPU ${CMAKE_CXX_COMPILER_ID} diff --git a/src/main.cpp b/src/main.cpp index 58c62b9..87de77c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -157,7 +157,7 @@ parseParams(const std::vector &args) { // Defaults params.iterations = DEFAULT_ITERS; - params.warmupIterations = 2; + params.warmupIterations = DEFAULT_WARMUPS; params.deckDir = DATA_DIR; params.outRows = DEFAULT_ENERGY_ENTRIES; @@ -212,10 +212,12 @@ parseParams(const std::vector &args) { using namespace std::placeholders; const auto arg = args[i]; if (read(i, arg, {"--iter", "-i"}, [&](auto &&s) { return bindInt(s, params.iterations, "iter"); })) continue; + if (read(i, arg, {"--warmups", "-w"}, [&](auto &&s) { return bindInt(s, params.warmupIterations, "iter"); })) continue; if (read(i, arg, {"--poses", "-n"}, [&](auto &&s) { bindInt(s, nposes, "poses"); })) continue; if (read(i, arg, {"--device", "-d"}, [&](auto &&s) { params.deviceSelector = s; })) continue; if (read(i, arg, {"--deck"}, [&](auto &&s) { params.deckDir = s; })) continue; if (read(i, arg, {"--out", "-o"}, [&](auto &&s) { params.output = s; })) continue; + if (read(i, arg, {"--csv"}, [&](auto &&s) { params.csv = true; params.csv_filename = s; })) continue; if (read(i, arg, {"--rows", "-r"}, [&](auto &&s) { return bindInt(s, params.outRows, "rows"); })) continue; if (read(i, arg, {"--wgsize", "-w"}, [&](auto &&s) { bindInts(s, wgsizes, "wgsize"); })) continue; if (read(i, arg, {"--ppwi", "-p"}, [&](auto &&s) { @@ -232,11 +234,6 @@ parseParams(const std::vector &args) { })) continue; - if (arg == "--csv") { - params.csv = true; - continue; - } - if (arg == "list" || arg == "--list" || arg == "-l") { params.list = true; continue; @@ -255,6 +252,8 @@ parseParams(const std::vector &args) { " [optional] default=0\n" << " -i --iter I Repeat kernel I times\n" " [optional] default=" << DEFAULT_ITERS << "\n" + << " -w --warmups W # of warmup iterations\n" + " [optional] default=" << DEFAULT_WARMUPS << "\n" << " -n --poses N Compute energies for only N poses, use 0 for deck max\n" " [optional] default=0 \n" << " -p --ppwi PPWI A CSV list of poses per work-item for the kernel, use `all` for everything\n" @@ -267,7 +266,7 @@ parseParams(const std::vector &args) { " [optional]\n" << " -r --rows N Output first N row(s) of energy values as part of the on-screen result\n" " [optional] default=" << DEFAULT_ENERGY_ENTRIES << "\n" - << " --csv Output results in CSV format\n" + << " --csv PATH Output results in CSV format\n" " [optional] default=false" << std::endl; @@ -429,10 +428,12 @@ void showHumanReadable(const Params &p, const Result &r, int indent = 1) { std::string prefix(indent, ' '); std::cout.precision(3); - auto contextMs = r.sample.contextTime - ? std::to_string(elapsedMillis(r.sample.contextTime->first, r.sample.contextTime->second)) + auto hostToDeviceMs = r.sample.hostToDevice + ? std::to_string(elapsedMillis(r.sample.hostToDevice->first, r.sample.hostToDevice->second)) + : "~"; + auto deviceToHostMs = r.sample.deviceToHost + ? std::to_string(elapsedMillis(r.sample.deviceToHost->first, r.sample.deviceToHost->second)) : "~"; - std::vector iterationTimesMs; std::transform(r.sample.kernelTimes.begin(), // r.sample.kernelTimes.end(), std::back_inserter(iterationTimesMs), @@ -446,7 +447,8 @@ void showHumanReadable(const Params &p, const Result &r, int indent = 1) { << "ppwi: " << r.sample.ppwi << ", " << "wgsize: " << r.sample.wgsize << " }\n" << prefix << " raw_iterations: [" << mk_string(iterationTimesMs) << "]\n" - << prefix << " context_ms: " << contextMs << "\n" + << prefix << " host_to_device_ms: " << hostToDeviceMs << "\n" + << prefix << " device_to_host_ms: " << deviceToHostMs << "\n" << prefix << " sum_ms: " << r.ms.sum << "\n" << prefix << " avg_ms: " << r.ms.mean << "\n" << prefix << " min_ms: " << r.ms.min << "\n" @@ -464,12 +466,24 @@ void showHumanReadable(const Params &p, const Result &r, int indent = 1) { } void showCsv(const Params &p, const Result &r, bool header) { - if (header) std::cout << "ppwi,wgsize,sum_ms,avg_ms,min_ms,max_ms,stddev_ms,interactions/s,gflops/s,gfinst/s\n"; - std::cout.precision(3); - std::cout << std::fixed; - std::cout << r.sample.ppwi << "," << r.sample.wgsize // + std::fstream out(p.csv_filename, std::ios::out | std::ios::trunc); + if (header) out << "ppwi,wgsize,sum_ms,avg_ms,min_ms,max_ms,stddev_ms,interactions/s,gflops/s,gfinst/s"// + << ",host_to_device_ms,device_to_host_ms\n"; + + auto hostToDeviceMs = r.sample.hostToDevice + ? std::to_string(elapsedMillis(r.sample.hostToDevice->first, r.sample.hostToDevice->second)) + : "~"; + auto deviceToHostMs = r.sample.deviceToHost + ? std::to_string(elapsedMillis(r.sample.deviceToHost->first, r.sample.deviceToHost->second)) + : "~"; + + out.precision(3); + out << std::fixed; + out << r.sample.ppwi << "," << r.sample.wgsize // << "," << r.ms.sum << "," << r.ms.mean << "," << r.ms.min << "," << r.ms.max << "," << r.ms.stdDev // - << "," << (r.interactionsPerSec) << "," << r.gflops << "," << r.ginsts << std::endl; + << "," << (r.interactionsPerSec) << "," << r.gflops << "," << r.ginsts // + << "," << hostToDeviceMs << "," << deviceToHostMs << std::endl; + out.close(); } template diff --git a/src/omp/fasten.hpp b/src/omp/fasten.hpp index cae2328..f20079b 100644 --- a/src/omp/fasten.hpp +++ b/src/omp/fasten.hpp @@ -181,56 +181,33 @@ template class IMPL_CLS final : public Bude { const auto natlig = p.natlig(); const auto natpro = p.natpro(); - std::array poses{}; - auto protein = static_cast(std::malloc(sizeof(Atom) * natpro)); - auto ligand = static_cast(std::malloc(sizeof(Atom) * natlig)); - auto forcefield = static_cast(std::malloc(sizeof(FFParams) * ntypes)); - auto energies = static_cast(std::malloc(sizeof(float) * nposes)); - - for (auto i = 0; i < 6; i++) - poses[i] = static_cast(std::malloc(sizeof(float) * nposes)); - -#pragma omp parallel - { - for (auto i = 0; i < 6; i++) { -#pragma omp for nowait - for (auto j = 0; j < nposes; j++) - poses[i][j] = p.poses[i][j]; - } -#pragma omp for nowait - for (auto i = 0; i < nposes; i++) - energies[i] = 0.f; - -#pragma omp for nowait - for (auto i = 0; i < natpro; i++) - protein[i] = p.protein[i]; - -#pragma omp for nowait - for (auto i = 0; i < natlig; i++) - ligand[i] = p.ligand[i]; - -#pragma omp for nowait - for (auto i = 0; i < ntypes; i++) - forcefield[i] = p.forcefield[i]; - } - auto contextEnd = now(); - sample.contextTime = {contextStart, contextEnd}; - - auto poses_0 = poses[0]; - auto poses_1 = poses[1]; - auto poses_2 = poses[2]; - auto poses_3 = poses[3]; - auto poses_4 = poses[4]; - auto poses_5 = poses[5]; + auto poses = p.poses.data(); + auto protein = p.protein.data(); + auto ligand = p.ligand.data(); + auto forcefield = p.forcefield.data(); + auto energies = sample.energies.data(); + + auto poses_0 = poses[0].data(); + auto poses_1 = poses[1].data(); + auto poses_2 = poses[2].data(); + auto poses_3 = poses[3].data(); + auto poses_4 = poses[4].data(); + auto poses_5 = poses[5].data(); #ifdef OMP_TARGET // clang-format off - #pragma omp target data \ - map(from: energies[:nposes]) \ + auto hostToDeviceStart = now(); + #pragma omp target enter data \ map(to: \ protein[:natpro], ligand[:natlig], forcefield[:ntypes], \ poses_0[:nposes], poses_1[:nposes], poses_2[:nposes], \ poses_3[:nposes], poses_4[:nposes], poses_5[:nposes] \ ) + + auto hostToDeviceEnd = now(); + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + + #pragma omp target enter data \ + map(alloc: energies[:nposes]) #endif // OMP_TARGET clang-format on for (size_t i = 0; i < p.totalIterations(); ++i) { auto kernelStart = now(); @@ -253,13 +230,18 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } - std::copy(energies, energies + p.nposes(), sample.energies.begin()); - std::free(protein); - std::free(ligand); - std::free(forcefield); - std::free(energies); - for (auto &pose : poses) - std::free(pose); + auto deviceToHostStart = now(); + #pragma omp target update from(energies[:nposes]) + + auto deviceToHostEnd = now(); + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; + + #pragma omp target exit data \ + map(release: \ + protein[:natpro], ligand[:natlig], forcefield[:ntypes], \ + poses_0[:nposes], poses_1[:nposes], poses_2[:nposes], \ + poses_3[:nposes], poses_4[:nposes], poses_5[:nposes], \ + energies[:nposes]) return sample; }; diff --git a/src/raja/fasten.hpp b/src/raja/fasten.hpp index 39c32c4..c6f4573 100644 --- a/src/raja/fasten.hpp +++ b/src/raja/fasten.hpp @@ -308,7 +308,7 @@ template class IMPL_CLS final : public Bude { Sample sample(PPWI, wgsize, p.nposes()); auto &rm = umpire::ResourceManager::getInstance(); - auto contextStart = now(); + auto hostToDeviceStart = now(); auto protein = allocate(p.protein); auto ligand = allocate(p.ligand); @@ -319,16 +319,18 @@ template class IMPL_CLS final : public Bude { auto transforms_4 = allocate(p.poses[4]); auto transforms_5 = allocate(p.poses[5]); auto forcefield = allocate(p.forcefield); - auto results = allocate(sample.energies.size()); synchronise(); + auto hostToDeviceEnd = now(); + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + auto host_energies = registerAllocation(sample.energies); + auto results = allocate(sample.energies.size()); + + synchronise(); - auto contextEnd = now(); - sample.contextTime = {contextStart, contextEnd}; - - for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) { + for (size_t i = 0; i < p.totalIterations(); ++i) { auto kernelStart = now(); fasten_main(device, wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), // protein, ligand, forcefield, // @@ -338,8 +340,12 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } + auto deviceToHostStart = now(); rm.copy(host_energies, results); + auto deviceToHostEnd = now(); + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; + deallocate(protein); deallocate(ligand); deallocate(transforms_0); diff --git a/src/sycl-acc/fasten.hpp b/src/sycl-acc/fasten.hpp index 02e34e3..1994b57 100644 --- a/src/sycl-acc/fasten.hpp +++ b/src/sycl-acc/fasten.hpp @@ -198,9 +198,9 @@ template class IMPL_CLS final : public Bude { auto device = devices[deviceIdx]; Sample sample(PPWI, wgsize, p.nposes()); - - auto contextStart = now(); sycl::queue queue(device); + + auto hostToDeviceStart = now(); sycl::buffer proteins(p.protein.data(), p.protein.size()); sycl::buffer ligands(p.ligand.data(), p.ligand.size()); sycl::buffer forcefields(p.forcefield.data(), p.forcefield.size()); @@ -212,8 +212,8 @@ template class IMPL_CLS final : public Bude { sycl::buffer transforms_5(p.poses[5].data(), p.poses[5].size()); sycl::buffer energies(sample.energies.size()); queue.wait_and_throw(); - auto contextEnd = now(); - sample.contextTime = {contextStart, contextEnd}; + auto hostToDeviceEnd = now(); + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) { auto kernelStart = now(); @@ -229,8 +229,12 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } + auto deviceToHostStart = now(); queue.submit([&](sycl::handler &h) { h.copy(energies.get_access(h), sample.energies.data()); }); queue.wait_and_throw(); + auto deviceToHostEnd = now(); + + sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; return sample; }; diff --git a/src/sycl-acc/model.cmake b/src/sycl-acc/model.cmake index 6a517c1..65b5b52 100644 --- a/src/sycl-acc/model.cmake +++ b/src/sycl-acc/model.cmake @@ -1,4 +1,3 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" "c++") @@ -60,18 +59,12 @@ macro(setup) set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) elseif (${SYCL_COMPILER} STREQUAL "DPCPP") - set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) - include_directories(${SYCL_COMPILER_DIR}/include/sycl) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") - set(CMAKE_CXX_COMPILER icpx) - set(CMAKE_C_COMPILER icx) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_C_COMPILER clang) register_append_cxx_flags(ANY -fsycl) register_append_link_flags(-fsycl) else () diff --git a/src/sycl-usm/fasten.hpp b/src/sycl-usm/fasten.hpp index 43455e6..0de512f 100644 --- a/src/sycl-usm/fasten.hpp +++ b/src/sycl-usm/fasten.hpp @@ -202,7 +202,8 @@ template class IMPL_CLS final : public Bude { Sample sample(PPWI, wgsize, p.nposes()); sycl::queue queue(device); - auto contextStart = now(); + auto hostToDeviceStart = now(); + auto proteins = allocate(p.protein, queue); auto ligands = allocate(p.ligand, queue); auto transforms_0 = allocate(p.poses[0], queue); @@ -212,12 +213,15 @@ template class IMPL_CLS final : public Bude { auto transforms_4 = allocate(p.poses[4], queue); auto transforms_5 = allocate(p.poses[5], queue); auto forcefields = allocate(p.forcefield, queue); + queue.wait_and_throw(); + + auto hostToDeviceEnd = now(); + sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd}; + auto energies = allocate(std::size(sample.energies), queue); queue.wait_and_throw(); - auto contextEnd = now(); - sample.contextTime = {contextStart, contextEnd}; - for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) { + for (size_t i = 0; i < p.totalIterations(); ++i) { auto kernelStart = now(); queue.submit([&](sycl::handler &h) { fasten_main(h, wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), @@ -231,9 +235,13 @@ template class IMPL_CLS final : public Bude { sample.kernelTimes.emplace_back(kernelStart, kernelEnd); } + auto deviceToHostStart = now(); queue.memcpy(std::data(sample.energies), energies, sizeof(float) * std::size(sample.energies)); queue.wait_and_throw(); + auto deviceToHostEnd = now(); + sample.deviceToHost = {hostToDeviceStart, hostToDeviceEnd}; + return sample; }; }; diff --git a/src/sycl-usm/model.cmake b/src/sycl-usm/model.cmake index dfef3e6..661347f 100644 --- a/src/sycl-usm/model.cmake +++ b/src/sycl-usm/model.cmake @@ -9,7 +9,7 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + HIPSYCL/ADAPTIVECPP - hipSYCL compiler (https://github.com/illuhad/hipSYCL) COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") register_flag_optional(SYCL_COMPILER_DIR @@ -47,7 +47,7 @@ macro(setup) find_package(hipSYCL CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "ACPP") + elseif (${SYCL_COMPILER} STREQUAL "ADAPTIVECPP") find_package(AdaptiveCpp CONFIG REQUIRED) elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") @@ -94,7 +94,7 @@ macro(setup_target NAME) if ( (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR (${SYCL_COMPILER} STREQUAL "HIPSYCL") OR - (${SYCL_COMPILER} STREQUAL "ACPP")) + (${SYCL_COMPILER} STREQUAL "ADAPTIVECPP")) # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified