Skip to content

Commit

Permalink
Use unique_ptr as temporary owner of USM allocation
Browse files Browse the repository at this point in the history
The unique_ptr serves as a temporary owner of the allocation until ownership
is passed over to the host task, at which point the unique_ptr releases it.

Also reduced allocation sizes, where too much was being
allocated.

Introduce smart_malloc_device, etc.

The smart_malloc_device<T>(count, q) makes USM allocation
and returns a unique_ptr<T, USMDeleter> which owns the
allocation. The function throws an exception (std::runtime_error)
if USM allocation is not successful.

Introduce async_smart_free.

This function is intended to replace the use of ad-hoc host_task submissions
for managing deallocation of temporary USM allocations.

The usage is as follows:

```
  // returns unique_ptr
  auto alloc_owner = smart_malloc_device<T>(count, q);

  // get raw pointer for use in kernels
  T *data = alloc_owner.get();

  [..SNIP..]

  // submit host_task that releases the unique_ptr
  // after the host task was successfully submitted
  // and ownership of USM allocation is transferred to
  // the said host task
  sycl::event ht_ev =
      async_smart_free(q,
      dependent_events,
      alloc_owner);

  [...SNIP...]
```
  • Loading branch information
oleksandr-pavlyk committed Dec 24, 2024
1 parent eb28d1a commit da3fbcc
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 92 deletions.
66 changes: 21 additions & 45 deletions dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1588,11 +1588,11 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
using CountT = std::uint32_t;

// memory for storing count and offset values
CountT *count_ptr =
sycl::malloc_device<CountT>(n_iters * n_counts, exec_q);
if (nullptr == count_ptr) {
throw std::runtime_error("Could not allocate USM-device memory");
}
auto count_owner =
dpctl::tensor::alloc_utils::smart_malloc_device<CountT>(
n_iters * n_counts, exec_q);

CountT *count_ptr = count_owner.get();

constexpr std::uint32_t zero_radix_iter{0};

Expand All @@ -1605,25 +1605,17 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
n_counts, count_ptr, proj_op,
is_ascending, depends);

sort_ev = exec_q.submit([=](sycl::handler &cgh) {
cgh.depends_on(sort_ev);
const sycl::context &ctx = exec_q.get_context();

using dpctl::tensor::alloc_utils::sycl_free_noexcept;
cgh.host_task(
[ctx, count_ptr]() { sycl_free_noexcept(count_ptr, ctx); });
});
sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
exec_q, {sort_ev}, count_owner);

return sort_ev;
}

ValueT *tmp_arr =
sycl::malloc_device<ValueT>(n_iters * n_to_sort, exec_q);
if (nullptr == tmp_arr) {
using dpctl::tensor::alloc_utils::sycl_free_noexcept;
sycl_free_noexcept(count_ptr, exec_q);
throw std::runtime_error("Could not allocate USM-device memory");
}
auto tmp_arr_owner =
dpctl::tensor::alloc_utils::smart_malloc_device<ValueT>(
n_iters * n_to_sort, exec_q);

ValueT *tmp_arr = tmp_arr_owner.get();

// iterations per each bucket
assert("Number of iterations must be even" && radix_iters % 2 == 0);
Expand Down Expand Up @@ -1657,17 +1649,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
}
}

sort_ev = exec_q.submit([=](sycl::handler &cgh) {
cgh.depends_on(sort_ev);

const sycl::context &ctx = exec_q.get_context();

using dpctl::tensor::alloc_utils::sycl_free_noexcept;
cgh.host_task([ctx, count_ptr, tmp_arr]() {
sycl_free_noexcept(tmp_arr, ctx);
sycl_free_noexcept(count_ptr, ctx);
});
});
sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
exec_q, {sort_ev}, tmp_arr_owner, count_owner);
}

return sort_ev;
Expand Down Expand Up @@ -1769,13 +1752,12 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;

const std::size_t total_nelems = iter_nelems * sort_nelems;
const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
IndexTy *workspace = sycl::malloc_device<IndexTy>(
padded_total_nelems + total_nelems, exec_q);
auto workspace_owner =
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(total_nelems,
exec_q);

if (nullptr == workspace) {
throw std::runtime_error("Could not allocate workspace on device");
}
// get raw USM pointer
IndexTy *workspace = workspace_owner.get();

using IdentityProjT = radix_sort_details::IdentityProj;
using IndexedProjT =
Expand Down Expand Up @@ -1820,14 +1802,8 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
});
});

sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(map_back_ev);

const sycl::context &ctx = exec_q.get_context();

using dpctl::tensor::alloc_utils::sycl_free_noexcept;
cgh.host_task([ctx, workspace] { sycl_free_noexcept(workspace, ctx); });
});
sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
exec_q, {map_back_ev}, workspace_owner);

return cleanup_ev;
}
Expand Down
70 changes: 24 additions & 46 deletions dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
#include <iterator>
#include <limits>
#include <stdexcept>
#include <sycl/sycl.hpp>
#include <vector>

#include <sycl/sycl.hpp>

#include "kernels/dpctl_tensor_types.hpp"
#include "kernels/sorting/merge_sort.hpp"
#include "kernels/sorting/radix_sort.hpp"
Expand Down Expand Up @@ -90,11 +91,11 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
const CompT &comp,
const std::vector<sycl::event> &depends)
{
IndexTy *index_data =
sycl::malloc_device<IndexTy>(iter_nelems * axis_nelems, exec_q);
if (index_data == nullptr) {
throw std::runtime_error("Unable to allocate device_memory");
}
auto index_data_owner =
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
iter_nelems * axis_nelems, exec_q);
// extract USM pointer
IndexTy *index_data = index_data_owner.get();

using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy, CompT>;

Expand Down Expand Up @@ -153,14 +154,8 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
});

sycl::event cleanup_host_task_event =
exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(write_out_ev);
const sycl::context &ctx = exec_q.get_context();

using dpctl::tensor::alloc_utils::sycl_free_noexcept;
cgh.host_task(
[ctx, index_data] { sycl_free_noexcept(index_data, ctx); });
});
dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev},
index_data_owner);

return cleanup_host_task_event;
};
Expand Down Expand Up @@ -283,11 +278,11 @@ sycl::event topk_merge_impl(
index_comp, depends);
}

IndexTy *index_data =
sycl::malloc_device<IndexTy>(iter_nelems * alloc_len, exec_q);
if (index_data == nullptr) {
throw std::runtime_error("Unable to allocate device_memory");
}
auto index_data_owner =
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
iter_nelems * alloc_len, exec_q);
// get raw USM pointer
IndexTy *index_data = index_data_owner.get();

// no need to populate index data: SLM will be populated with default
// values
Expand Down Expand Up @@ -427,14 +422,8 @@ sycl::event topk_merge_impl(
});

sycl::event cleanup_host_task_event =
exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(write_topk_ev);
const sycl::context &ctx = exec_q.get_context();

using dpctl::tensor::alloc_utils::sycl_free_noexcept;
cgh.host_task(
[ctx, index_data] { sycl_free_noexcept(index_data, ctx); });
});
dpctl::tensor::alloc_utils::async_smart_free(
exec_q, {write_topk_ev}, index_data_owner);

return cleanup_host_task_event;
}
Expand Down Expand Up @@ -474,15 +463,13 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,

const std::size_t total_nelems = iter_nelems * axis_nelems;
const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
IndexTy *workspace = sycl::malloc_device<IndexTy>(
padded_total_nelems + total_nelems, exec_q);
auto workspace_owner =
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
padded_total_nelems + total_nelems, exec_q);

IndexTy *tmp_tp = sycl::malloc_device<IndexTy>(total_nelems, exec_q);

if (nullptr == workspace || nullptr == tmp_tp) {
throw std::runtime_error(
"Not enough device memory for radix sort topk");
}
// get raw USM pointer
IndexTy *workspace = workspace_owner.get();
IndexTy *tmp_tp = workspace + padded_total_nelems;

using IdentityProjT = radix_sort_details::IdentityProj;
using IndexedProjT =
Expand Down Expand Up @@ -536,17 +523,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
});
});

sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(write_topk_ev);

const sycl::context &ctx = exec_q.get_context();

using dpctl::tensor::alloc_utils::sycl_free_noexcept;
cgh.host_task([ctx, workspace, tmp_tp] {
sycl_free_noexcept(workspace, ctx);
sycl_free_noexcept(tmp_tp, ctx);
});
});
sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
exec_q, {write_topk_ev}, workspace_owner);

return cleanup_ev;
}
Expand Down
93 changes: 92 additions & 1 deletion dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@

#include <exception>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <vector>

#include "sycl/sycl.hpp"

Expand Down Expand Up @@ -73,11 +76,99 @@ void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept
}
}

template <typename T> void sycl_free_noexcept(T *ptr, sycl::queue &q) noexcept
template <typename T>
void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept
{
sycl_free_noexcept(ptr, q.get_context());
}

/*! @brief Deleter for USM allocations, suitable as the deleter type of
 *  std::unique_ptr.
 *
 * Captures the SYCL context the allocation is bound to at construction
 * time and frees the pointer via sycl_free_noexcept, so invoking the
 * deleter never throws.
 */
class USMDeleter
{
private:
    sycl::context ctx_;

public:
    //! Capture the context associated with the given queue.
    //! explicit: constructing a deleter from a queue/context should be a
    //! deliberate act, not an implicit conversion.
    explicit USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {}
    //! Capture the given context directly.
    explicit USMDeleter(const sycl::context &ctx) : ctx_(ctx) {}

    //! Free the USM allocation; never throws.
    template <typename T> void operator()(T *ptr) const
    {
        sycl_free_noexcept(ptr, ctx_);
    }
};

/*! @brief Allocate USM memory and return a smart pointer owning it.
 *
 * @param count     Number of elements of type T to allocate.
 * @param q         Queue whose context/device the allocation is bound to.
 * @param kind      USM allocation kind (device, shared, or host).
 * @param propList  Optional SYCL property list for the allocation.
 *
 * @return std::unique_ptr owning the allocation; freeing is performed by
 *         USMDeleter via sycl_free_noexcept.
 *
 * @throws std::runtime_error if the USM allocation fails.
 */
template <typename T>
std::unique_ptr<T, USMDeleter>
smart_malloc(std::size_t count,
             const sycl::queue &q,
             sycl::usm::alloc kind,
             const sycl::property_list &propList = {})
{
    T *ptr = sycl::malloc<T>(count, q, kind, propList);
    if (nullptr == ptr) {
        // message must not claim device memory: this path is taken for
        // shared and host allocations too
        throw std::runtime_error("Unable to allocate USM memory");
    }

    auto usm_deleter = USMDeleter(q);
    return std::unique_ptr<T, USMDeleter>(ptr, usm_deleter);
}

/*! @brief Allocate USM-device memory owned by a unique_ptr.
 *
 * Thin convenience wrapper over smart_malloc with
 * sycl::usm::alloc::device. Throws std::runtime_error on failure.
 */
template <typename T>
std::unique_ptr<T, USMDeleter>
smart_malloc_device(std::size_t count,
                    const sycl::queue &q,
                    const sycl::property_list &propList = {})
{
    constexpr sycl::usm::alloc alloc_kind = sycl::usm::alloc::device;
    return smart_malloc<T>(count, q, alloc_kind, propList);
}

/*! @brief Allocate USM-shared memory owned by a unique_ptr.
 *
 * Thin convenience wrapper over smart_malloc with
 * sycl::usm::alloc::shared. Throws std::runtime_error on failure.
 */
template <typename T>
std::unique_ptr<T, USMDeleter>
smart_malloc_shared(std::size_t count,
                    const sycl::queue &q,
                    const sycl::property_list &propList = {})
{
    constexpr sycl::usm::alloc alloc_kind = sycl::usm::alloc::shared;
    return smart_malloc<T>(count, q, alloc_kind, propList);
}

/*! @brief Allocate USM-host memory owned by a unique_ptr.
 *
 * Thin convenience wrapper over smart_malloc with
 * sycl::usm::alloc::host. Throws std::runtime_error on failure.
 */
template <typename T>
std::unique_ptr<T, USMDeleter>
smart_malloc_host(std::size_t count,
                  const sycl::queue &q,
                  const sycl::property_list &propList = {})
{
    return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
}

/*! @brief Misspelled alias of smart_malloc_host ("jost" was a typo),
 *  kept for backward compatibility. Prefer smart_malloc_host.
 */
template <typename T>
[[deprecated("use smart_malloc_host")]] std::unique_ptr<T, USMDeleter>
smart_malloc_jost(std::size_t count,
                  const sycl::queue &q,
                  const sycl::property_list &propList = {})
{
    return smart_malloc_host<T>(count, q, propList);
}

/*! @brief Submit a host_task that frees USM allocations once dependent
 *  events complete, taking over ownership from the given smart pointers.
 *
 * Each argument must be a std::unique_ptr<T, USMDeleter> owning a USM
 * allocation. Raw pointers and deleters are copied first; ownership is
 * released from the unique_ptrs only after the host_task has been
 * successfully submitted, so a throwing submission leaks nothing.
 *
 * @param exec_q   Queue the deallocation host_task is submitted to.
 * @param depends  Events that must complete before deallocation runs.
 * @param args     unique_ptr owners of the USM allocations.
 * @return Event associated with the submitted host_task.
 */
template <typename... Args>
sycl::event async_smart_free(sycl::queue &exec_q,
                             const std::vector<sycl::event> &depends,
                             Args &&...args)
{
    constexpr std::size_t n = sizeof...(Args);
    // submitting a host_task that frees nothing would be a caller bug
    static_assert(n > 0, "async_smart_free requires at least one argument");

    std::vector<void *> ptrs;
    ptrs.reserve(n);
    // static_cast suffices for the implicit T* -> void* conversion
    (ptrs.push_back(static_cast<void *>(args.get())), ...);

    std::vector<USMDeleter> dels;
    dels.reserve(n);
    (dels.push_back(args.get_deleter()), ...);

    sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        cgh.host_task([ptrs, dels]() {
            for (std::size_t i = 0; i < ptrs.size(); ++i) {
                dels[i](ptrs[i]);
            }
        });
    });
    // release only after successful submission: if submit() threw, the
    // unique_ptrs would still own and free the allocations
    (args.release(), ...);

    return ht_e;
}

} // end of namespace alloc_utils
} // end of namespace tensor
} // end of namespace dpctl

0 comments on commit da3fbcc

Please sign in to comment.