diff --git a/src/acc/fasten.hpp b/src/acc/fasten.hpp
index c40a38f..35f3494 100644
--- a/src/acc/fasten.hpp
+++ b/src/acc/fasten.hpp
@@ -146,7 +146,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     std::vector<Device> devices;
     acc_device_t device_type = acc_get_device_type();
     for (int i = 0; i < acc_get_num_devices(device_type); ++i) {
-      devices.template emplace_back(i, "OMP target device #" + std::to_string(i));
+      devices.template emplace_back(i, "ACC target device #" + std::to_string(i));
     }
     return devices;
   };
@@ -158,78 +158,77 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
 
     Sample sample(PPWI, wgsize, p.nposes());
 
-    auto contextStart = now();
-
-    std::array<float *, 6> poses{};
-    auto protein = static_cast<Atom *>(std::malloc(sizeof(Atom) * p.natpro()));
-    auto ligand = static_cast<Atom *>(std::malloc(sizeof(Atom) * p.natlig()));
-    auto forcefield = static_cast<FFParams *>(std::malloc(sizeof(FFParams) * p.ntypes()));
-    auto energies = static_cast<float *>(std::malloc(sizeof(float) * p.nposes()));
-
-    for (size_t i = 0; i < 6; i++) {
-      poses[i] = static_cast<float *>(std::malloc(sizeof(float) * p.nposes()));
-      std::copy(p.poses[i].begin(), p.poses[i].end(), poses[i]);
-    }
-
-    std::copy(p.protein.begin(), p.protein.end(), protein);
-    std::copy(p.ligand.begin(), p.ligand.end(), ligand);
-    std::copy(p.forcefield.begin(), p.forcefield.end(), forcefield);
-    std::fill(energies, energies + p.nposes(), 0.0);
-
-    auto contextEnd = now();
-    sample.contextTime = {contextStart, contextEnd};
-
-    auto poses_0 = poses[0];
-    auto poses_1 = poses[1];
-    auto poses_2 = poses[2];
-    auto poses_3 = poses[3];
-    auto poses_4 = poses[4];
-    auto poses_5 = poses[5];
     const auto ntypes = p.ntypes();
     const auto nposes = p.nposes();
     const auto natlig = p.natlig();
     const auto natpro = p.natpro();
 
-// clang-format off
-#pragma acc data                                    \
-        copyin(                                     \
-            protein[:natpro], ligand[:natlig],      \
-            forcefield[:ntypes], poses_0[:nposes],  \
-            poses_1[:nposes], poses_2[:nposes],     \
-            poses_3[:nposes], poses_4[:nposes],     \
-            poses_5[:nposes])                       \
-        copyout(energies[:nposes])
-{
+    auto poses = p.poses.data();  
+    auto protein = p.protein.data();
+    auto ligand = p.ligand.data();
+    auto forcefield = p.forcefield.data();
+    auto energies = sample.energies.data();
+
+    auto poses_0 = poses[0].data();
+    auto poses_1 = poses[1].data();
+    auto poses_2 = poses[2].data();
+    auto poses_3 = poses[3].data();
+    auto poses_4 = poses[4].data();
+    auto poses_5 = poses[5].data();
+
+    // clang-format off
+    auto hostToDeviceStart = now();
+        
+  #pragma acc enter data                      \
+    copyin(                                   \
+      protein[:natpro], ligand[:natlig],      \
+      forcefield[:ntypes], poses_0[:nposes],  \
+      poses_1[:nposes], poses_2[:nposes],     \
+      poses_3[:nposes], poses_4[:nposes],     \
+      poses_5[:nposes])                       \
+
+    auto hostToDeviceEnd = now();
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
+    
+  #pragma acc enter data                      \
+    create(energies[:nposes])
+
+    // clang-format on
+    for (size_t i = 0; i < p.totalIterations(); ++i) {
+      auto kernelStart = now();
+
+    // clang-format off
+  #pragma acc parallel loop                   \
+    present(                                  \
+      protein[:natpro], ligand[:natlig],      \
+      forcefield[:ntypes], poses_0[:nposes],  \
+      poses_1[:nposes], poses_2[:nposes],     \
+      poses_3[:nposes], poses_4[:nposes],     \
+      poses_5[:nposes], energies[:nposes])
       // clang-format on
-      for (size_t i = 0; i < p.totalIterations(); ++i) {
-        auto kernelStart = now();
-
-// clang-format off
-#pragma acc parallel loop  \
-        present(                                    \
-            protein[:natpro], ligand[:natlig],      \
-            forcefield[:ntypes], poses_0[:nposes],  \
-            poses_1[:nposes], poses_2[:nposes],     \
-            poses_3[:nposes], poses_4[:nposes],     \
-            poses_5[:nposes], energies[:nposes]) wait
-        // clang-format on
-        for (size_t group = 0; group < (nposes / PPWI); group++) {
-          fasten_main<PPWI>(group, ntypes, nposes, natlig, natpro,                //
-                            protein, ligand,                                      //
-                            poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, //
-                            forcefield, energies);
-        }
-        auto kernelEnd = now();
-        sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
+      for (size_t group = 0; group < (nposes / PPWI); group++) {
+        fasten_main<PPWI>(group, ntypes, nposes, natlig, natpro,                //
+                          protein, ligand,                                      //
+                          poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, //
+                          forcefield, energies);
       }
+      auto kernelEnd = now();
+      sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
-    std::copy(energies, energies + p.nposes(), sample.energies.begin());
-    std::free(protein);
-    std::free(ligand);
-    std::free(forcefield);
-    std::free(energies);
-    for (auto &pose : poses)
-      std::free(pose);
+
+  auto deviceToHostStart = now();
+        
+  #pragma acc update                          \
+    host(energies[:nposes])
+        
+  auto deviceToHostEnd = now();
+  sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};
+  
+  #pragma acc exit data                       \
+    delete(protein, ligand, forcefield,       \
+           poses_0, poses_1, poses_2,         \
+           poses_3, poses_4, poses_5,         \
+           energies)
 
     return sample;
   };
diff --git a/src/acc/model.cmake b/src/acc/model.cmake
index a85ea03..c40b36c 100644
--- a/src/acc/model.cmake
+++ b/src/acc/model.cmake
@@ -43,6 +43,11 @@ register_flag_optional(TARGET_PROCESSOR
         Refer to `nvc++ --help` for the full list"
         "")
 
+register_flag_optional(OFFLOAD_FLAGS
+   "OpenACC Offload Flags"
+   ""
+)
+
 set(ACC_FLAGS_OFFLOAD_GNU -foffload=-lm)
 
 macro(setup)
@@ -89,5 +94,9 @@ macro(setup)
 
     endif ()
 
+    separate_arguments(OFFLOAD_FLAGS)
+    register_append_cxx_flags(ANY ${OFFLOAD_FLAGS})
+    register_append_link_flags(${OFFLOAD_FLAGS})
+
 endmacro()
 
diff --git a/src/bude.h b/src/bude.h
index c2b0d91..190c656 100644
--- a/src/bude.h
+++ b/src/bude.h
@@ -10,6 +10,7 @@
 
 #define DIFF_TOLERANCE_PCT 0.025f
 #define DEFAULT_ITERS 8
+#define DEFAULT_WARMUPS 2
 #define DEFAULT_ENERGY_ENTRIES 8
 
 #define DATA_DIR "../data/bm1"
@@ -61,6 +62,7 @@ struct Params {
   size_t maxPoses, iterations, warmupIterations, outRows;
   std::string deckDir, output, deviceSelector;
   bool csv;
+  std::string csv_filename;
 
   bool list;
 
@@ -85,9 +87,10 @@ struct Sample {
   size_t ppwi, wgsize;
   std::vector<float> energies;
   std::vector<std::pair<TimePoint, TimePoint>> kernelTimes;
-  std::optional<std::pair<TimePoint, TimePoint>> contextTime;
+  std::optional<std::pair<TimePoint, TimePoint>> hostToDevice;
+  std::optional<std::pair<TimePoint, TimePoint>> deviceToHost;
   Sample(size_t ppwi, size_t wgsize, size_t nposes)
-      : ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), contextTime() {}
+      : ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), hostToDevice(), deviceToHost() {}
 };
 
 using Device = std::pair<size_t, std::string>;
diff --git a/src/cuda/fasten.hpp b/src/cuda/fasten.hpp
index 39927a4..8707b53 100644
--- a/src/cuda/fasten.hpp
+++ b/src/cuda/fasten.hpp
@@ -253,7 +253,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
 
     Sample sample(PPWI, wgsize, p.nposes());
 
-    auto contextStart = now();
+    auto hostToDeviceStart = now();
     auto protein = allocate(p.protein);
     auto ligand = allocate(p.ligand);
     auto transforms_0 = allocate(p.poses[0]);
@@ -263,12 +263,14 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     auto transforms_4 = allocate(p.poses[4]);
     auto transforms_5 = allocate(p.poses[5]);
     auto forcefield = allocate(p.forcefield);
-    auto results = allocate<float>(sample.energies.size());
     checkError(cudaDeviceSynchronize());
-    auto contextEnd = now();
+    auto hostToDeviceEnd = now();
 
-    sample.contextTime = {contextStart, contextEnd};
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
 
+    auto results = allocate<float>(sample.energies.size());
+    checkError(cudaDeviceSynchronize());
+    
     size_t global = std::ceil(double(p.nposes()) / PPWI);
     global = std::ceil(double(global) / double(wgsize));
     size_t local = wgsize;
@@ -285,8 +287,13 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
+    auto deviceToHostStart = now(); 
+    
     checkError(
         cudaMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), cudaMemcpyDeviceToHost));
+    
+    auto deviceToHostEnd = now(); 
+    sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};
 
     free(protein);
     free(ligand);
diff --git a/src/hip/fasten.hpp b/src/hip/fasten.hpp
index a3c5973..a77ceb5 100644
--- a/src/hip/fasten.hpp
+++ b/src/hip/fasten.hpp
@@ -215,7 +215,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
 
     Sample sample(PPWI, wgsize, p.nposes());
 
-    auto contextStart = now();
+    auto hostToDeviceStart = now();
     auto protein = allocate(p.protein);
     auto ligand = allocate(p.ligand);
     auto transforms_0 = allocate(p.poses[0]);
@@ -225,12 +225,14 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     auto transforms_4 = allocate(p.poses[4]);
     auto transforms_5 = allocate(p.poses[5]);
     auto forcefield = allocate(p.forcefield);
-    auto results = allocate<float>(sample.energies.size());
     checkError(hipDeviceSynchronize());
-    auto contextEnd = now();
+    auto hostToDeviceEnd = now();
 
-    sample.contextTime = {contextStart, contextEnd};
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
 
+    auto results = allocate<float>(sample.energies.size());
+    checkError(hipDeviceSynchronize());
+    
     size_t global = std::ceil(double(p.nposes()) / PPWI);
     global = std::ceil(double(global) / double(wgsize));
     size_t local = wgsize;
@@ -247,8 +249,12 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
+    auto deviceToHostStart = now();
     checkError(
         hipMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), hipMemcpyDeviceToHost));
+    auto deviceToHostEnd = now();
+
+    sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};
 
     free(protein);
     free(ligand);
diff --git a/src/kokkos/fasten.hpp b/src/kokkos/fasten.hpp
index 1bd4ffc..3772f34 100644
--- a/src/kokkos/fasten.hpp
+++ b/src/kokkos/fasten.hpp
@@ -172,10 +172,9 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
   }
 
   template <typename T> static Kokkos::View<T *> mkView(const std::string &name, const std::vector<T> &xs) {
-    Kokkos::View<T *> view(name, xs.size());
-    auto mirror = Kokkos::create_mirror_view(view);
-    for (size_t i = 0; i < xs.size(); i++)
-      mirror[i] = xs[i];
+    Kokkos::View<const T *, Kokkos::LayoutLeft, Kokkos::HostSpace,
+      Kokkos::MemoryTraits<Kokkos::Unmanaged>> mirror (std::data(xs), std::size(xs));
+    Kokkos::View<T *> view (name, std::size(xs));
     Kokkos::deep_copy(view, mirror);
     return view;
   }
@@ -195,10 +194,10 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     if (!Kokkos::is_initialized()) {
       Kokkos::initialize();
     }
-
+    
     Sample sample(PPWI, wgsize, p.nposes());
 
-    auto contextStart = now();
+    auto hostToDeviceStart = now();
 
     auto protein = mkView("protein", p.protein);
     auto ligand = mkView("ligand", p.ligand);
@@ -209,12 +208,15 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     auto transforms_4 = mkView("transforms_4", p.poses[4]);
     auto transforms_5 = mkView("transforms_5", p.poses[5]);
     auto forcefield = mkView("forcefield", p.forcefield);
-    Kokkos::View<float *> results("results", sample.energies.size());
     Kokkos::fence();
-    auto contextEnd = now();
-    sample.contextTime = {contextStart, contextEnd};
 
-    for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
+    auto hostToDeviceEnd = now();
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
+    
+    Kokkos::View<float *> results(Kokkos::ViewAllocateWithoutInitializing("results"), sample.energies.size());
+    Kokkos::fence();
+
+    for (size_t i = 0; i < p.totalIterations(); ++i) {
       auto kernelStart = now();
       fasten_main(wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), //
                   protein, ligand, forcefield,                            //
@@ -224,8 +226,14 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
+    auto deviceToHostStart = now();
+
     auto result_mirror = Kokkos::create_mirror_view(results);
     Kokkos::deep_copy(result_mirror, results);
+
+    auto deviceToHostEnd = now();
+    sample.deviceToHost = {deviceToHostStart, deviceToHostEnd}; 
+
     for (size_t i = 0; i < results.size(); i++) {
       sample.energies[i] = result_mirror[i];
     }
diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake
index e1a71f8..a38a734 100644
--- a/src/kokkos/model.cmake
+++ b/src/kokkos/model.cmake
@@ -4,13 +4,15 @@ register_flag_optional(CMAKE_CXX_COMPILER
          See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
         "c++")
 
-register_flag_required(KOKKOS_IN_TREE
+register_flag_optional(KOKKOS_IN_TREE
         "Absolute path to the *source* distribution directory of Kokkos.
          Remember to append Kokkos specific flags as well, for example:
-
              -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
+         See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")
 
-         See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
+register_flag_optional(KOKKOS_IN_PACKAGE
+        "Absolute path to package R-Path containing Kokkos libs.
+         Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")
 
 # compiler vendor and arch specific flags
 set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
@@ -19,15 +21,22 @@ macro(setup)
 
     cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md
 
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+
     message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")
 
-    if (EXISTS "${KOKKOS_IN_TREE}")
+    if (KOKKOS_IN_TREE)
         add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
         register_link_library(Kokkos::kokkos)
+    elseif (KOKKOS_IN_PACKAGE)
+        message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
+        find_package(Kokkos REQUIRED)
+        register_link_library(Kokkos::kokkos)
     else ()
-        message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
+        message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
     endif ()
-
+  
     register_append_compiler_and_arch_specific_cxx_flags(
             KOKKOS_FLAGS_CPU
             ${CMAKE_CXX_COMPILER_ID}
diff --git a/src/main.cpp b/src/main.cpp
index 58c62b9..87de77c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -157,7 +157,7 @@ parseParams(const std::vector<std::string> &args) {
 
   // Defaults
   params.iterations = DEFAULT_ITERS;
-  params.warmupIterations = 2;
+  params.warmupIterations = DEFAULT_WARMUPS;
   params.deckDir = DATA_DIR;
   params.outRows = DEFAULT_ENERGY_ENTRIES;
 
@@ -212,10 +212,12 @@ parseParams(const std::vector<std::string> &args) {
     using namespace std::placeholders;
     const auto arg = args[i];
     if (read(i, arg, {"--iter", "-i"}, [&](auto &&s) { return bindInt(s, params.iterations, "iter"); })) continue;
+    if (read(i, arg, {"--warmups", "-w"}, [&](auto &&s) { return bindInt(s, params.warmupIterations, "iter"); })) continue;
     if (read(i, arg, {"--poses", "-n"}, [&](auto &&s) { bindInt(s, nposes, "poses"); })) continue;
     if (read(i, arg, {"--device", "-d"}, [&](auto &&s) { params.deviceSelector = s; })) continue;
     if (read(i, arg, {"--deck"}, [&](auto &&s) { params.deckDir = s; })) continue;
     if (read(i, arg, {"--out", "-o"}, [&](auto &&s) { params.output = s; })) continue;
+    if (read(i, arg, {"--csv"}, [&](auto &&s) { params.csv = true; params.csv_filename = s; })) continue;
     if (read(i, arg, {"--rows", "-r"}, [&](auto &&s) { return bindInt(s, params.outRows, "rows"); })) continue;
     if (read(i, arg, {"--wgsize", "-w"}, [&](auto &&s) { bindInts(s, wgsizes, "wgsize"); })) continue;
     if (read(i, arg, {"--ppwi", "-p"}, [&](auto &&s) {
@@ -232,11 +234,6 @@ parseParams(const std::vector<std::string> &args) {
         }))
       continue;
 
-    if (arg == "--csv") {
-      params.csv = true;
-      continue;
-    }
-
     if (arg == "list" || arg == "--list" || arg == "-l") {
       params.list = true;
       continue;
@@ -255,6 +252,8 @@ parseParams(const std::vector<std::string> &args) {
              "                       [optional] default=0\n"
           << "  -i --iter    I       Repeat kernel I times\n"
              "                       [optional] default=" << DEFAULT_ITERS << "\n"
+          << "  -w --warmups W       # of warmup iterations\n"
+             "                       [optional] default=" << DEFAULT_WARMUPS << "\n"
           << "  -n --poses   N       Compute energies for only N poses, use 0 for deck max\n"
              "                       [optional] default=0 \n"
           << "  -p --ppwi    PPWI    A CSV list of poses per work-item for the kernel, use `all` for everything\n"
@@ -267,7 +266,7 @@ parseParams(const std::vector<std::string> &args) {
              "                       [optional]\n"
           << "  -r --rows    N       Output first N row(s) of energy values as part of the on-screen result\n"
              "                       [optional] default=" << DEFAULT_ENERGY_ENTRIES << "\n"
-          << "     --csv             Output results in CSV format\n"
+          << "     --csv     PATH    Output results in CSV format\n"
              "                       [optional] default=false"
 
           << std::endl;
@@ -429,10 +428,12 @@ void showHumanReadable(const Params &p, const Result &r, int indent = 1) {
   std::string prefix(indent, ' ');
   std::cout.precision(3);
 
-  auto contextMs = r.sample.contextTime
-                       ? std::to_string(elapsedMillis(r.sample.contextTime->first, r.sample.contextTime->second))
+  auto hostToDeviceMs = r.sample.hostToDevice
+                       ? std::to_string(elapsedMillis(r.sample.hostToDevice->first, r.sample.hostToDevice->second))
+                       : "~";
+  auto deviceToHostMs = r.sample.deviceToHost
+                       ? std::to_string(elapsedMillis(r.sample.deviceToHost->first, r.sample.deviceToHost->second))
                        : "~";
-
   std::vector<double> iterationTimesMs;
   std::transform(r.sample.kernelTimes.begin(), //
                  r.sample.kernelTimes.end(), std::back_inserter(iterationTimesMs),
@@ -446,7 +447,8 @@ void showHumanReadable(const Params &p, const Result &r, int indent = 1) {
             << "ppwi: " << r.sample.ppwi << ", "
             << "wgsize: " << r.sample.wgsize << " }\n"
             << prefix << "   raw_iterations:      [" << mk_string(iterationTimesMs) << "]\n"
-            << prefix << "   context_ms:          " << contextMs << "\n"
+            << prefix << "   host_to_device_ms:          " << hostToDeviceMs << "\n"
+            << prefix << "   device_to_host_ms:          " << deviceToHostMs << "\n"
             << prefix << "   sum_ms:              " << r.ms.sum << "\n"
             << prefix << "   avg_ms:              " << r.ms.mean << "\n"
             << prefix << "   min_ms:              " << r.ms.min << "\n"
@@ -464,12 +466,24 @@ void showHumanReadable(const Params &p, const Result &r, int indent = 1) {
 }
 
 void showCsv(const Params &p, const Result &r, bool header) {
-  if (header) std::cout << "ppwi,wgsize,sum_ms,avg_ms,min_ms,max_ms,stddev_ms,interactions/s,gflops/s,gfinst/s\n";
-  std::cout.precision(3);
-  std::cout << std::fixed;
-  std::cout << r.sample.ppwi << "," << r.sample.wgsize                                                         //
+  std::fstream out(p.csv_filename, std::ios::out | std::ios::trunc);
+  if (header) out << "ppwi,wgsize,sum_ms,avg_ms,min_ms,max_ms,stddev_ms,interactions/s,gflops/s,gfinst/s"//
+                        << ",host_to_device_ms,device_to_host_ms\n";
+  
+  auto hostToDeviceMs = r.sample.hostToDevice
+                       ? std::to_string(elapsedMillis(r.sample.hostToDevice->first, r.sample.hostToDevice->second))
+                       : "~";
+  auto deviceToHostMs = r.sample.deviceToHost
+                       ? std::to_string(elapsedMillis(r.sample.deviceToHost->first, r.sample.deviceToHost->second))
+                       : "~";
+    
+  out.precision(3);
+  out << std::fixed;
+  out << r.sample.ppwi << "," << r.sample.wgsize                                                         //
             << "," << r.ms.sum << "," << r.ms.mean << "," << r.ms.min << "," << r.ms.max << "," << r.ms.stdDev //
-            << "," << (r.interactionsPerSec) << "," << r.gflops << "," << r.ginsts << std::endl;
+            << "," << (r.interactionsPerSec) << "," << r.gflops << "," << r.ginsts                             //
+            << "," << hostToDeviceMs << "," << deviceToHostMs << std::endl;
+  out.close();
 }
 
 template <size_t... Ns>
diff --git a/src/omp/fasten.hpp b/src/omp/fasten.hpp
index cae2328..f20079b 100644
--- a/src/omp/fasten.hpp
+++ b/src/omp/fasten.hpp
@@ -181,56 +181,33 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     const auto natlig = p.natlig();
     const auto natpro = p.natpro();
 
-    std::array<float *, 6> poses{};
-    auto protein = static_cast<Atom *>(std::malloc(sizeof(Atom) * natpro));
-    auto ligand = static_cast<Atom *>(std::malloc(sizeof(Atom) * natlig));
-    auto forcefield = static_cast<FFParams *>(std::malloc(sizeof(FFParams) * ntypes));
-    auto energies = static_cast<float *>(std::malloc(sizeof(float) * nposes));
-
-    for (auto i = 0; i < 6; i++)
-      poses[i] = static_cast<float *>(std::malloc(sizeof(float) * nposes));
-
-#pragma omp parallel
-    {
-      for (auto i = 0; i < 6; i++) {
-#pragma omp for nowait
-        for (auto j = 0; j < nposes; j++)
-          poses[i][j] = p.poses[i][j];
-      }
-#pragma omp for nowait
-      for (auto i = 0; i < nposes; i++)
-        energies[i] = 0.f;
-
-#pragma omp for nowait
-      for (auto i = 0; i < natpro; i++)
-        protein[i] = p.protein[i];
-
-#pragma omp for nowait
-      for (auto i = 0; i < natlig; i++)
-        ligand[i] = p.ligand[i];
-
-#pragma omp for nowait
-      for (auto i = 0; i < ntypes; i++)
-        forcefield[i] = p.forcefield[i];
-    }
-    auto contextEnd = now();
-    sample.contextTime = {contextStart, contextEnd};
-
-    auto poses_0 = poses[0];
-    auto poses_1 = poses[1];
-    auto poses_2 = poses[2];
-    auto poses_3 = poses[3];
-    auto poses_4 = poses[4];
-    auto poses_5 = poses[5];
+    auto poses = p.poses.data();  
+    auto protein = p.protein.data();
+    auto ligand = p.ligand.data();
+    auto forcefield = p.forcefield.data();
+    auto energies = sample.energies.data();
+
+    auto poses_0 = poses[0].data();
+    auto poses_1 = poses[1].data();
+    auto poses_2 = poses[2].data();
+    auto poses_3 = poses[3].data();
+    auto poses_4 = poses[4].data();
+    auto poses_5 = poses[5].data();   
 
 #ifdef OMP_TARGET // clang-format off
-  #pragma omp target data                                      \
-    map(from: energies[:nposes])                               \
+    auto hostToDeviceStart = now();
+  #pragma omp target enter data                                \
     map(to:                                                    \
       protein[:natpro], ligand[:natlig], forcefield[:ntypes],  \
       poses_0[:nposes], poses_1[:nposes], poses_2[:nposes],    \
       poses_3[:nposes], poses_4[:nposes], poses_5[:nposes]     \
     )
+        
+    auto hostToDeviceEnd = now();
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
+
+  #pragma omp target enter data                                \
+    map(alloc: energies[:nposes])                              
 #endif // OMP_TARGET clang-format on
     for (size_t i = 0; i < p.totalIterations(); ++i) {
       auto kernelStart = now();
@@ -253,13 +230,18 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
-    std::copy(energies, energies + p.nposes(), sample.energies.begin());
-    std::free(protein);
-    std::free(ligand);
-    std::free(forcefield);
-    std::free(energies);
-    for (auto &pose : poses)
-      std::free(pose);
+    auto deviceToHostStart = now();
+  #pragma omp target update from(energies[:nposes])
+
+    auto deviceToHostEnd = now();
+    sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};
+
+  #pragma omp target exit data                                 \
+    map(release:                                               \
+      protein[:natpro], ligand[:natlig], forcefield[:ntypes],  \
+      poses_0[:nposes], poses_1[:nposes], poses_2[:nposes],    \
+      poses_3[:nposes], poses_4[:nposes], poses_5[:nposes],    \
+      energies[:nposes])
 
     return sample;
   };
diff --git a/src/raja/fasten.hpp b/src/raja/fasten.hpp
index 39c32c4..c6f4573 100644
--- a/src/raja/fasten.hpp
+++ b/src/raja/fasten.hpp
@@ -308,7 +308,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     Sample sample(PPWI, wgsize, p.nposes());
     auto &rm = umpire::ResourceManager::getInstance();
 
-    auto contextStart = now();
+    auto hostToDeviceStart = now();
 
     auto protein = allocate(p.protein);
     auto ligand = allocate(p.ligand);
@@ -319,16 +319,18 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     auto transforms_4 = allocate(p.poses[4]);
     auto transforms_5 = allocate(p.poses[5]);
     auto forcefield = allocate(p.forcefield);
-    auto results = allocate<float>(sample.energies.size());
 
     synchronise();
 
+    auto hostToDeviceEnd = now();
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
+    
     auto host_energies = registerAllocation(sample.energies);
+    auto results = allocate<float>(sample.energies.size());
+    
+    synchronise();
 
-    auto contextEnd = now();
-    sample.contextTime = {contextStart, contextEnd};
-
-    for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
+    for (size_t i = 0; i < p.totalIterations(); ++i) {
       auto kernelStart = now();
       fasten_main(device, wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), //
                   protein, ligand, forcefield,                                    //
@@ -338,8 +340,12 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
+    auto deviceToHostStart = now();
     rm.copy(host_energies, results);
 
+    auto deviceToHostEnd = now();
+    sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};
+
     deallocate(protein);
     deallocate(ligand);
     deallocate(transforms_0);
diff --git a/src/sycl-acc/fasten.hpp b/src/sycl-acc/fasten.hpp
index 02e34e3..1994b57 100644
--- a/src/sycl-acc/fasten.hpp
+++ b/src/sycl-acc/fasten.hpp
@@ -198,9 +198,9 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     auto device = devices[deviceIdx];
 
     Sample sample(PPWI, wgsize, p.nposes());
-
-    auto contextStart = now();
     sycl::queue queue(device);
+        
+    auto hostToDeviceStart = now();
     sycl::buffer<Atom> proteins(p.protein.data(), p.protein.size());
     sycl::buffer<Atom> ligands(p.ligand.data(), p.ligand.size());
     sycl::buffer<FFParams> forcefields(p.forcefield.data(), p.forcefield.size());
@@ -212,8 +212,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     sycl::buffer<float> transforms_5(p.poses[5].data(), p.poses[5].size());
     sycl::buffer<float> energies(sample.energies.size());
     queue.wait_and_throw();
-    auto contextEnd = now();
-    sample.contextTime = {contextStart, contextEnd};
+    auto hostToDeviceEnd = now();
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
 
     for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
       auto kernelStart = now();
@@ -229,8 +229,12 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
+    auto deviceToHostStart = now();
     queue.submit([&](sycl::handler &h) { h.copy(energies.get_access<R>(h), sample.energies.data()); });
     queue.wait_and_throw();
+    auto deviceToHostEnd = now();
+        
+    sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};
 
     return sample;
   };
diff --git a/src/sycl-acc/model.cmake b/src/sycl-acc/model.cmake
index 6a517c1..65b5b52 100644
--- a/src/sycl-acc/model.cmake
+++ b/src/sycl-acc/model.cmake
@@ -1,4 +1,3 @@
-
 register_flag_optional(CMAKE_CXX_COMPILER
         "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler"
         "c++")
@@ -60,18 +59,12 @@ macro(setup)
         set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop)
 
     elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
-        set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
-        include_directories(${SYCL_COMPILER_DIR}/include/sycl)
         register_append_cxx_flags(ANY -fsycl)
         register_append_link_flags(-fsycl)
     elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX")
-        set(CMAKE_CXX_COMPILER icpx)
-        set(CMAKE_C_COMPILER icx)
         register_append_cxx_flags(ANY -fsycl)
         register_append_link_flags(-fsycl)
     elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang")
-        set(CMAKE_CXX_COMPILER clang++)
-        set(CMAKE_C_COMPILER clang)
         register_append_cxx_flags(ANY -fsycl)
         register_append_link_flags(-fsycl)
     else ()
diff --git a/src/sycl-usm/fasten.hpp b/src/sycl-usm/fasten.hpp
index 43455e6..0de512f 100644
--- a/src/sycl-usm/fasten.hpp
+++ b/src/sycl-usm/fasten.hpp
@@ -202,7 +202,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     Sample sample(PPWI, wgsize, p.nposes());
     sycl::queue queue(device);
 
-    auto contextStart = now();
+    auto hostToDeviceStart = now();
+    
     auto proteins = allocate(p.protein, queue);
     auto ligands = allocate(p.ligand, queue);
     auto transforms_0 = allocate(p.poses[0], queue);
@@ -212,12 +213,15 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
     auto transforms_4 = allocate(p.poses[4], queue);
     auto transforms_5 = allocate(p.poses[5], queue);
     auto forcefields = allocate(p.forcefield, queue);
+    queue.wait_and_throw();
+    
+    auto hostToDeviceEnd = now();
+    sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};
+    
     auto energies = allocate<float>(std::size(sample.energies), queue);
     queue.wait_and_throw();
-    auto contextEnd = now();
-    sample.contextTime = {contextStart, contextEnd};
 
-    for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
+    for (size_t i = 0; i < p.totalIterations(); ++i) {
       auto kernelStart = now();
       queue.submit([&](sycl::handler &h) {
         fasten_main(h, wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(),
@@ -231,9 +235,13 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
       sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
     }
 
+    auto deviceToHostStart = now();
     queue.memcpy(std::data(sample.energies), energies, sizeof(float) * std::size(sample.energies));
     queue.wait_and_throw();
 
+    auto deviceToHostEnd = now();
+    sample.deviceToHost = {hostToDeviceStart, hostToDeviceEnd};
+
     return sample;
   };
 };
diff --git a/src/sycl-usm/model.cmake b/src/sycl-usm/model.cmake
index dfef3e6..661347f 100644
--- a/src/sycl-usm/model.cmake
+++ b/src/sycl-usm/model.cmake
@@ -9,7 +9,7 @@ register_flag_required(SYCL_COMPILER
            ONEAPI-ICPX  - icpx as a standalone compiler
            ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh  --include-intel-llvm`)
            DPCPP        - dpc++ as a standalone compiler (https://github.com/intel/llvm)
-           HIPSYCL      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
+           HIPSYCL/ADAPTIVECPP      - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
            COMPUTECPP   - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")
 
 register_flag_optional(SYCL_COMPILER_DIR
@@ -47,7 +47,7 @@ macro(setup)
         find_package(hipSYCL CONFIG REQUIRED)
         message(STATUS "ok")
 
-    elseif (${SYCL_COMPILER} STREQUAL "ACPP")
+    elseif (${SYCL_COMPILER} STREQUAL "ADAPTIVECPP")
         find_package(AdaptiveCpp CONFIG REQUIRED)
     elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP")
 
@@ -94,7 +94,7 @@ macro(setup_target NAME)
     if (
     (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR
     (${SYCL_COMPILER} STREQUAL "HIPSYCL") OR
-    (${SYCL_COMPILER} STREQUAL "ACPP"))
+    (${SYCL_COMPILER} STREQUAL "ADAPTIVECPP"))
     
         # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their
         # own custom integration header flags AFTER the target has been specified