Skip to content

Commit

Permalink
Merge pull request #5 from hpcgroup/timer-updates
Browse files Browse the repository at this point in the history
miniBUDE Timer Updates
  • Loading branch information
pranav-sivaraman authored Feb 16, 2024
2 parents 8fd4290 + 1c259f3 commit 0aca8e0
Show file tree
Hide file tree
Showing 14 changed files with 229 additions and 181 deletions.
129 changes: 64 additions & 65 deletions src/acc/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
std::vector<Device> devices;
acc_device_t device_type = acc_get_device_type();
for (int i = 0; i < acc_get_num_devices(device_type); ++i) {
devices.template emplace_back(i, "OMP target device #" + std::to_string(i));
devices.template emplace_back(i, "ACC target device #" + std::to_string(i));
}
return devices;
};
Expand All @@ -158,78 +158,77 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();

std::array<float *, 6> poses{};
auto protein = static_cast<Atom *>(std::malloc(sizeof(Atom) * p.natpro()));
auto ligand = static_cast<Atom *>(std::malloc(sizeof(Atom) * p.natlig()));
auto forcefield = static_cast<FFParams *>(std::malloc(sizeof(FFParams) * p.ntypes()));
auto energies = static_cast<float *>(std::malloc(sizeof(float) * p.nposes()));

for (size_t i = 0; i < 6; i++) {
poses[i] = static_cast<float *>(std::malloc(sizeof(float) * p.nposes()));
std::copy(p.poses[i].begin(), p.poses[i].end(), poses[i]);
}

std::copy(p.protein.begin(), p.protein.end(), protein);
std::copy(p.ligand.begin(), p.ligand.end(), ligand);
std::copy(p.forcefield.begin(), p.forcefield.end(), forcefield);
std::fill(energies, energies + p.nposes(), 0.0);

auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

auto poses_0 = poses[0];
auto poses_1 = poses[1];
auto poses_2 = poses[2];
auto poses_3 = poses[3];
auto poses_4 = poses[4];
auto poses_5 = poses[5];
const auto ntypes = p.ntypes();
const auto nposes = p.nposes();
const auto natlig = p.natlig();
const auto natpro = p.natpro();

// clang-format off
#pragma acc data \
copyin( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes]) \
copyout(energies[:nposes])
{
auto poses = p.poses.data();
auto protein = p.protein.data();
auto ligand = p.ligand.data();
auto forcefield = p.forcefield.data();
auto energies = sample.energies.data();

auto poses_0 = poses[0].data();
auto poses_1 = poses[1].data();
auto poses_2 = poses[2].data();
auto poses_3 = poses[3].data();
auto poses_4 = poses[4].data();
auto poses_5 = poses[5].data();

// clang-format off
auto hostToDeviceStart = now();

#pragma acc enter data \
copyin( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes]) \

auto hostToDeviceEnd = now();
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

#pragma acc enter data \
create(energies[:nposes])

// clang-format on
for (size_t i = 0; i < p.totalIterations(); ++i) {
auto kernelStart = now();

// clang-format off
#pragma acc parallel loop \
present( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes], energies[:nposes])
// clang-format on
for (size_t i = 0; i < p.totalIterations(); ++i) {
auto kernelStart = now();

// clang-format off
#pragma acc parallel loop \
present( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes], energies[:nposes]) wait
// clang-format on
for (size_t group = 0; group < (nposes / PPWI); group++) {
fasten_main<PPWI>(group, ntypes, nposes, natlig, natpro, //
protein, ligand, //
poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, //
forcefield, energies);
}
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
for (size_t group = 0; group < (nposes / PPWI); group++) {
fasten_main<PPWI>(group, ntypes, nposes, natlig, natpro, //
protein, ligand, //
poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, //
forcefield, energies);
}
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}
std::copy(energies, energies + p.nposes(), sample.energies.begin());
std::free(protein);
std::free(ligand);
std::free(forcefield);
std::free(energies);
for (auto &pose : poses)
std::free(pose);

auto deviceToHostStart = now();

#pragma acc update \
host(energies[:nposes])

auto deviceToHostEnd = now();
sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

#pragma acc exit data \
delete(protein, ligand, forcefield, \
poses_0, poses_1, poses_2, \
poses_3, poses_4, poses_5, \
energies)

return sample;
};
Expand Down
9 changes: 9 additions & 0 deletions src/acc/model.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ register_flag_optional(TARGET_PROCESSOR
Refer to `nvc++ --help` for the full list"
"")

register_flag_optional(OFFLOAD_FLAGS
"OpenACC Offload Flags"
""
)

set(ACC_FLAGS_OFFLOAD_GNU -foffload=-lm)

macro(setup)
Expand Down Expand Up @@ -89,5 +94,9 @@ macro(setup)

endif ()

separate_arguments(OFFLOAD_FLAGS)
register_append_cxx_flags(ANY ${OFFLOAD_FLAGS})
register_append_link_flags(${OFFLOAD_FLAGS})

endmacro()

7 changes: 5 additions & 2 deletions src/bude.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#define DIFF_TOLERANCE_PCT 0.025f
#define DEFAULT_ITERS 8
#define DEFAULT_WARMUPS 2
#define DEFAULT_ENERGY_ENTRIES 8

#define DATA_DIR "../data/bm1"
Expand Down Expand Up @@ -61,6 +62,7 @@ struct Params {
size_t maxPoses, iterations, warmupIterations, outRows;
std::string deckDir, output, deviceSelector;
bool csv;
std::string csv_filename;

bool list;

Expand All @@ -85,9 +87,10 @@ struct Sample {
size_t ppwi, wgsize;
std::vector<float> energies;
std::vector<std::pair<TimePoint, TimePoint>> kernelTimes;
std::optional<std::pair<TimePoint, TimePoint>> contextTime;
std::optional<std::pair<TimePoint, TimePoint>> hostToDevice;
std::optional<std::pair<TimePoint, TimePoint>> deviceToHost;
Sample(size_t ppwi, size_t wgsize, size_t nposes)
: ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), contextTime() {}
: ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), hostToDevice(), deviceToHost() {}
};

using Device = std::pair<size_t, std::string>;
Expand Down
15 changes: 11 additions & 4 deletions src/cuda/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();
auto hostToDeviceStart = now();
auto protein = allocate(p.protein);
auto ligand = allocate(p.ligand);
auto transforms_0 = allocate(p.poses[0]);
Expand All @@ -263,12 +263,14 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
auto transforms_4 = allocate(p.poses[4]);
auto transforms_5 = allocate(p.poses[5]);
auto forcefield = allocate(p.forcefield);
auto results = allocate<float>(sample.energies.size());
checkError(cudaDeviceSynchronize());
auto contextEnd = now();
auto hostToDeviceEnd = now();

sample.contextTime = {contextStart, contextEnd};
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

auto results = allocate<float>(sample.energies.size());
checkError(cudaDeviceSynchronize());

size_t global = std::ceil(double(p.nposes()) / PPWI);
global = std::ceil(double(global) / double(wgsize));
size_t local = wgsize;
Expand All @@ -285,8 +287,13 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();

checkError(
cudaMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), cudaMemcpyDeviceToHost));

auto deviceToHostEnd = now();
sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

free(protein);
free(ligand);
Expand Down
14 changes: 10 additions & 4 deletions src/hip/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();
auto hostToDeviceStart = now();
auto protein = allocate(p.protein);
auto ligand = allocate(p.ligand);
auto transforms_0 = allocate(p.poses[0]);
Expand All @@ -225,12 +225,14 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
auto transforms_4 = allocate(p.poses[4]);
auto transforms_5 = allocate(p.poses[5]);
auto forcefield = allocate(p.forcefield);
auto results = allocate<float>(sample.energies.size());
checkError(hipDeviceSynchronize());
auto contextEnd = now();
auto hostToDeviceEnd = now();

sample.contextTime = {contextStart, contextEnd};
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

auto results = allocate<float>(sample.energies.size());
checkError(hipDeviceSynchronize());

size_t global = std::ceil(double(p.nposes()) / PPWI);
global = std::ceil(double(global) / double(wgsize));
size_t local = wgsize;
Expand All @@ -247,8 +249,12 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();
checkError(
hipMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), hipMemcpyDeviceToHost));
auto deviceToHostEnd = now();

sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

free(protein);
free(ligand);
Expand Down
28 changes: 18 additions & 10 deletions src/kokkos/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,9 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
}

template <typename T> static Kokkos::View<T *> mkView(const std::string &name, const std::vector<T> &xs) {
Kokkos::View<T *> view(name, xs.size());
auto mirror = Kokkos::create_mirror_view(view);
for (size_t i = 0; i < xs.size(); i++)
mirror[i] = xs[i];
Kokkos::View<const T *, Kokkos::LayoutLeft, Kokkos::HostSpace,
Kokkos::MemoryTraits<Kokkos::Unmanaged>> mirror (std::data(xs), std::size(xs));
Kokkos::View<T *> view (name, std::size(xs));
Kokkos::deep_copy(view, mirror);
return view;
}
Expand All @@ -195,10 +194,10 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
if (!Kokkos::is_initialized()) {
Kokkos::initialize();
}

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();
auto hostToDeviceStart = now();

auto protein = mkView("protein", p.protein);
auto ligand = mkView("ligand", p.ligand);
Expand All @@ -209,12 +208,15 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
auto transforms_4 = mkView("transforms_4", p.poses[4]);
auto transforms_5 = mkView("transforms_5", p.poses[5]);
auto forcefield = mkView("forcefield", p.forcefield);
Kokkos::View<float *> results("results", sample.energies.size());
Kokkos::fence();
auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
auto hostToDeviceEnd = now();
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

Kokkos::View<float *> results(Kokkos::ViewAllocateWithoutInitializing("results"), sample.energies.size());
Kokkos::fence();

for (size_t i = 0; i < p.totalIterations(); ++i) {
auto kernelStart = now();
fasten_main(wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), //
protein, ligand, forcefield, //
Expand All @@ -224,8 +226,14 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();

auto result_mirror = Kokkos::create_mirror_view(results);
Kokkos::deep_copy(result_mirror, results);

auto deviceToHostEnd = now();
sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

for (size_t i = 0; i < results.size(); i++) {
sample.energies[i] = result_mirror[i];
}
Expand Down
21 changes: 15 additions & 6 deletions src/kokkos/model.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ register_flag_optional(CMAKE_CXX_COMPILER
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++")

register_flag_required(KOKKOS_IN_TREE
register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example:
-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")

See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")

# compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
Expand All @@ -19,15 +21,22 @@ macro(setup)

cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)

message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")

if (EXISTS "${KOKKOS_IN_TREE}")
if (KOKKOS_IN_TREE)
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos)
elseif (KOKKOS_IN_PACKAGE)
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else ()
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif ()

register_append_compiler_and_arch_specific_cxx_flags(
KOKKOS_FLAGS_CPU
${CMAKE_CXX_COMPILER_ID}
Expand Down
Loading

0 comments on commit 0aca8e0

Please sign in to comment.