From be9f5ce16ad5dec1af3c1ef744ee68fd6a9b9c55 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 13 Dec 2024 18:48:05 -0600 Subject: [PATCH 01/15] Add test file for top_k functionality --- dpctl/tests/test_usm_ndarray_top_k.py | 94 +++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_top_k.py diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py new file mode 100644 index 0000000000..954c21a990 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -0,0 +1,94 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) +def test_topk_1d_largest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + o = dpt.ones(n, dtype=dtype) + z = dpt.zeros(n, dtype=dtype) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + s = dpt.top_k(inp, k, mode="largest") + assert s.values.shape == (k,) + assert s.values.dtype == inp.dtype + assert s.indices.shape == (k,) + assert dpt.all(s.values == dpt.ones(k, dtype=dtype)) + assert dpt.all(s.values == inp[s.indices]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [33, 255, 257, 513, 1021, 8193]) +def test_topk_1d_smallest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + o = dpt.ones(n, dtype=dtype) + z = dpt.zeros(n, dtype=dtype) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + s = dpt.top_k(inp, k, mode="smallest") + assert s.values.shape == (k,) + assert s.values.dtype == inp.dtype + assert s.indices.shape == (k,) + assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)) + assert dpt.all(s.values == inp[s.indices]) From ff813652572f0c6e70985d6083d9e96b0bcf143d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 17 Dec 2024 15:05:19 -0800 Subject: [PATCH 02/15] Adds test to triage AMD CPU top_k failure --- dpctl/tests/test_usm_ndarray_top_k.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index 954c21a990..e78f96f399 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -92,3 +92,24 @@ def test_topk_1d_smallest(dtype, n): assert s.indices.shape == (k,) assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)) assert dpt.all(s.values == inp[s.indices]) + + +# triage failing top k radix implementation on CPU +# replicates from Python behavior of radix sort topk implementation +def 
test_topk_largest_1d_radix_i1_255(): + get_queue_or_skip() + n = 255 + dt = "i1" + + o = dpt.ones(n, dtype=dt) + z = dpt.zeros(n, dtype=dt) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + sorted = dpt.copy(dpt.sort(inp, descending=True, kind="radixsort")[:k]) + argsorted = dpt.copy( + dpt.argsort(inp, descending=True, kind="radixsort")[:k] + ) + assert dpt.all(sorted == dpt.ones(k, dtype=dt)) + assert dpt.all(sorted == inp[argsorted]) From 58da4169f58ef6ffb0cddecd33cecb2e18437924 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 18 Dec 2024 10:12:23 -0800 Subject: [PATCH 03/15] radix sort triage test now parametrizes for identical sizes to topk_1d_largest test --- dpctl/tests/test_usm_ndarray_top_k.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index e78f96f399..4e05615c80 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -96,9 +96,9 @@ def test_topk_1d_smallest(dtype, n): # triage failing top k radix implementation on CPU # replicates from Python behavior of radix sort topk implementation -def test_topk_largest_1d_radix_i1_255(): +@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) +def test_topk_largest_1d_radix_i1_255(n): get_queue_or_skip() - n = 255 dt = "i1" o = dpt.ones(n, dtype=dt) From fd4bab3dda17ee56408ac9f1d8adf1cb6042111e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 20 Dec 2024 16:51:19 -0600 Subject: [PATCH 04/15] Add sort_utils.hpp with iota_impl Reuse that function call in sorting code-base where argsort is used. --- .../include/kernels/sorting/merge_sort.hpp | 16 ++- .../include/kernels/sorting/radix_sort.hpp | 51 ++++----- .../include/kernels/sorting/sort_utils.hpp | 103 ++++++++++++++++++ .../include/kernels/sorting/topk.hpp | 34 ++++-- 4 files changed, 162 insertions(+), 42 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index f3b5030c48..002da0ff9b 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -33,6 +33,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" namespace dpctl { @@ -811,20 +812,27 @@ sycl::event stable_argsort_axis1_contig_impl( const size_t total_nelems = iter_nelems * sort_nelems; + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + +#if 1 + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + +#else sycl::event populate_indexed_data_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); const sycl::range<1> range{total_nelems}; - using KernelName = - populate_index_data_krn; - - cgh.parallel_for(range, [=](sycl::id<1> id) { + cgh.parallel_for(range, [=](sycl::id<1> id) { size_t i = id[0]; res_tp[i] = static_cast(i); }); }); +#endif // Sort segments of the array sycl::event base_sort_ev = diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index dc3da24315..84edb1dd3b 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -38,6 +38,7 @@
 #include <sycl/sycl.hpp>
 
 #include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/sort_utils.hpp"
 #include "utils/sycl_alloc_utils.hpp"
 
 namespace dpctl
@@ -1256,9 +1257,7 @@ struct subgroup_radix_sort
                             const uint16_t id = wi * block_size + i;
                             if (id < n)
                                 values[i] = std::move(
-                                    this_input_arr[iter_val_offset +
-                                                   static_cast<std::size_t>(
-                                                       id)]);
+                                    this_input_arr[iter_val_offset + id]);
                         }
 
                         while (true) {
@@ -1272,8 +1271,7 @@ struct subgroup_radix_sort
                             // counting phase
                             auto pcounter =
                                 get_accessor_pointer(counter_acc) +
-                                static_cast<std::size_t>(wi) +
-                                iter_counter_offset;
+                                (wi + iter_counter_offset);
 
                             // initialize counters
 #pragma unroll
@@ -1348,19 +1346,15 @@ struct subgroup_radix_sort
 
                                 // scan contiguous numbers
                                 uint16_t bin_sum[bin_count];
-                                bin_sum[0] =
-                                    counter_acc[iter_counter_offset +
-                                                static_cast<std::size_t>(
-                                                    wi * bin_count)];
+                                const std::size_t counter_offset0 =
+                                    iter_counter_offset + wi * bin_count;
+                                bin_sum[0] = counter_acc[counter_offset0];
 #pragma unroll
                                 for (uint16_t i = 1; i < bin_count; ++i)
                                     bin_sum[i] =
                                         bin_sum[i - 1] +
-                                        counter_acc
-                                            [iter_counter_offset +
-                                             static_cast<std::size_t>(
-                                                 wi * bin_count + i)];
+                                        counter_acc[counter_offset0 + i];
 
                                 sycl::group_barrier(ndit.get_group());
 
@@ -1374,10 +1368,7 @@ struct subgroup_radix_sort
                                 // add to local sum, generate exclusive scan result
 #pragma unroll
                                 for (uint16_t i = 0; i < bin_count; ++i)
-                                    counter_acc[iter_counter_offset +
-                                                static_cast<std::size_t>(
-                                                    wi * bin_count + i +
-                                                    1)] =
+                                    counter_acc[counter_offset0 + i + 1] =
                                         sum_scan + bin_sum[i];
 
                                 if (wi == 0)
@@ -1407,10 +1398,8 @@ struct subgroup_radix_sort
                                 if (r < n) {
                                     // move the values to source range and
                                     // destroy the values
-                                    this_output_arr
-                                        [iter_val_offset +
-                                         static_cast<std::size_t>(r)] =
-                                            std::move(values[i]);
+                                    this_output_arr[iter_val_offset + r] =
+                                        std::move(values[i]);
                                 }
                             }
 
@@ -1422,8 +1411,7 @@ struct subgroup_radix_sort
                             for (uint16_t i = 0; i < block_size; ++i) {
                                 const uint16_t r = indices[i];
                                 if (r < n)
-                                    exchange_acc[iter_exchange_offset +
-                                                 static_cast<std::size_t>(r)] =
+                                    exchange_acc[iter_exchange_offset + r] =
                                         std::move(values[i]);
                             }
 
@@ -1435,8 +1423,7 @@ struct subgroup_radix_sort
                                 if (id < n)
                                     values[i] = std::move(
                                         exchange_acc[iter_exchange_offset +
-                                                     static_cast<std::size_t>(
-                                                         id)]);
+                                                     id]);
                             }
 
                             sycl::group_barrier(ndit.get_group());
@@ -1795,18 +1782,26 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
                                 radix_sort_details::IndexedProj<argTy, IdentityProjT>;
     const IndexedProjT proj_op{arg_tp, IdentityProjT{}};
 
+    using IotaKernelName = radix_argsort_iota_krn<argTy, IndexTy>;
+
+#if 1
+    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
+
+    sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
+        exec_q, workspace, total_nelems, depends);
+#else
+
     sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
 
-        using KernelName = radix_argsort_iota_krn<argTy, IndexTy>;
-
-        cgh.parallel_for<KernelName>(
+        cgh.parallel_for<IotaKernelName>(
             sycl::range<1>(total_nelems), [=](sycl::id<1> id) {
                 size_t i = id[0];
                 IndexTy sort_id = static_cast<IndexTy>(i);
                 workspace[i] = sort_id;
             });
     });
+#endif
 
     sycl::event radix_sort_ev =
         radix_sort_details::parallel_radix_sort_impl(
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
new file mode 100644
index 0000000000..1b1c361d75
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
@@ -0,0 +1,103 @@
+//=== sort_utils.hpp - Implementation of sorting utilities ---*-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache 
License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines utility functions used by tensor sort/argsort kernels.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace sort_utils_detail
+{
+
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+template <typename KernelName, typename T>
+sycl::event iota_impl(sycl::queue &exec_q,
+                      T *data,
+                      std::size_t nelems,
+                      const std::vector<sycl::event> &dependent_events)
+{
+    constexpr std::uint32_t lws = 256;
+    constexpr std::uint32_t n_wi = 4;
+    const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws);
+
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::range<1> lRange{lws};
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event e = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_events);
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_id();
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+
+            const std::size_t offset = (gid - lane_id) * n_wi;
+            const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
+
+            std::array<T, n_wi> stripe{};
+#pragma unroll
+            for (std::uint32_t i = 0; i < n_wi; ++i) {
+                stripe[i] = T(offset + lane_id + i * max_sgSize);
+            }
+
+            if (offset + n_wi * max_sgSize < nelems) {
+                constexpr auto group_ls_props = syclexp::properties{
+                    syclexp::data_placement_striped
+                    // , syclexp::full_group
+                };
+
+                auto out_multi_ptr = sycl::address_space_cast<
+                    sycl::access::address_space::global_space,
+                    sycl::access::decorated::yes>(&data[offset]);
+
+                syclexp::group_store(sg, sycl::span{&stripe[0], n_wi},
+                                     out_multi_ptr, group_ls_props);
+            }
+            else {
+                for (std::size_t idx = offset + lane_id; idx < nelems;
+                     idx += max_sgSize)
+                {
+                    data[idx] = T(idx);
+                }
+            }
+        });
+    });
+
+    return e;
+}
+
+} // end of namespace sort_utils_detail
+} // end of namespace kernels
+} // end of namespace tensor
+} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
index 2674f877c9..da9e0bd812 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -34,9 +34,10 @@
 #include 
 
 #include "kernels/dpctl_tensor_types.hpp"
-#include "merge_sort.hpp"
-#include "radix_sort.hpp"
-#include "search_sorted_detail.hpp"
+#include "kernels/sorting/merge_sort.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/search_sorted_detail.hpp"
+#include "kernels/sorting/sort_utils.hpp"
 #include "utils/sycl_alloc_utils.hpp"
 
 #include 
@@ -95,20 +96,26 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
         throw std::runtime_error("Unable to allocate device_memory");
     }
 
+    using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy>;
+
+#if 1
+    using 
dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); +#else sycl::event populate_indexed_data_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); auto const &range = sycl::range<1>(iter_nelems * axis_nelems); - using KernelName = - topk_populate_index_data_krn; - - cgh.parallel_for(range, [=](sycl::id<1> id) { + cgh.parallel_for(range, [=](sycl::id<1> id) { std::size_t i = id[0]; index_data[i] = static_cast(i); }); }); +#endif std::size_t sorted_block_size; // Sort segments of the array @@ -480,18 +487,25 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, radix_sort_details::IndexedProj; const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + using IotaKernelName = topk_iota_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); +#else sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using KernelName = topk_iota_krn; - - cgh.parallel_for( + cgh.parallel_for( sycl::range<1>(total_nelems), [=](sycl::id<1> id) { size_t i = id[0]; IndexTy sort_id = static_cast(i); workspace[i] = sort_id; }); }); +#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( From d9cc9195cc2f73befee518fb3b87055d83b7324d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 22 Dec 2024 14:55:45 -0600 Subject: [PATCH 05/15] Simplify write-out kernels in topk implementation (avoid recomputing gid) --- .../include/kernels/sorting/topk.hpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index da9e0bd812..7ea91ad148 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -410,17 +410,17 @@ sycl::event topk_merge_impl( topk_partial_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; + const std::size_t gid = id[0]; - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * alloc_len + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; + const std::size_t src_idx = iter_gid * alloc_len + axis_gid; + const std::size_t dst_idx = gid; - auto res_ind = index_data[src_idx]; + const auto res_ind = index_data[src_idx]; vals_tp[dst_idx] = arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % axis_nelems; + inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); @@ -519,17 +519,17 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, using KernelName = topk_radix_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; + const std::size_t gid = id[0]; - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; + const std::size_t src_idx = iter_gid * axis_nelems + axis_gid; + const std::size_t dst_idx = gid; - IndexTy res_ind = tmp_tp[src_idx]; + const IndexTy res_ind = tmp_tp[src_idx]; vals_tp[dst_idx] = arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % 
axis_nelems;
+            inds_tp[dst_idx] = (res_ind % axis_nelems);
         });
     });
 

From eb28d1a939909e1170a625cb008afa5ae82fe05a Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 22 Dec 2024 14:56:56 -0600
Subject: [PATCH 06/15] Add explicit read into registers

---
 .../libtensor/include/kernels/sorting/topk.hpp      | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
index 7ea91ad148..6698aa50de 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -145,8 +145,9 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
             std::size_t src_idx = iter_gid * axis_nelems + axis_gid;
             std::size_t dst_idx = iter_gid * k + axis_gid;
 
-            auto res_ind = index_data[src_idx];
-            vals_tp[dst_idx] = arg_tp[res_ind];
+            const IndexTy res_ind = index_data[src_idx];
+            const argTy v = arg_tp[res_ind];
+            vals_tp[dst_idx] = v;
             inds_tp[dst_idx] = res_ind % axis_nelems;
         });
     });
@@ -418,8 +419,9 @@ sycl::event topk_merge_impl(
             const std::size_t src_idx = iter_gid * alloc_len + axis_gid;
             const std::size_t dst_idx = gid;
 
-            const auto res_ind = index_data[src_idx];
-            vals_tp[dst_idx] = arg_tp[res_ind];
+            const IndexTy res_ind = index_data[src_idx];
+            const argTy v = arg_tp[res_ind];
+            vals_tp[dst_idx] = v;
             inds_tp[dst_idx] = (res_ind % axis_nelems);
         });
     });
@@ -528,7 +530,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
             const std::size_t dst_idx = gid;
 
             const IndexTy res_ind = tmp_tp[src_idx];
-            vals_tp[dst_idx] = arg_tp[res_ind];
+            const argTy v = arg_tp[res_ind];
+            vals_tp[dst_idx] = v;
             inds_tp[dst_idx] = (res_ind % axis_nelems);
         });
     });

From da3fbcc45cae158a511e0ae6591ed8fd9dba456c Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Fri, 20 Dec 2024 16:54:15 -0600
Subject: [PATCH 07/15] Use unique_ptr as temporary owner of USM allocation

The unique_ptr remains the owner until the allocation is passed over
to the host task, at which point the unique_ptr's ownership is
released.

Also reduced allocation sizes, where too much was being allocated.

Introduce smart_malloc_device, etc.

smart_malloc_device<T>(count, q) makes a USM allocation and returns
a unique_ptr which owns the allocation. The function throws
an exception (std::runtime_error) if the USM allocation is not
successful.

Introduce async_smart_free.

This function is intended to replace the use of host_task submissions
to manage deallocation of USM temporaries.

The usage is as follows:

```
// returns unique_ptr<T, USMDeleter>
auto alloc_owner = smart_malloc_device<T>(count, q);

// get raw pointer for use in kernels
T *data = alloc_owner.get();

[..SNIP..]

// submit host_task that releases the unique_ptr
// after the host task was successfully submitted
// and ownership of the USM allocation is transferred to
// the said host task
sycl::event ht_ev = async_smart_free(q, dependent_events, alloc_owner);

[...SNIP...]
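// (illustration only, not part of the original snippet) ht_ev behaves
// like any other sycl::event: it can be used to order subsequent
// submissions, or one can block until the deleters have run:
ht_ev.wait();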
``` --- .../include/kernels/sorting/radix_sort.hpp | 66 +++++-------- .../include/kernels/sorting/topk.hpp | 70 +++++--------- .../include/utils/sycl_alloc_utils.hpp | 93 ++++++++++++++++++- 3 files changed, 137 insertions(+), 92 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 84edb1dd3b..de0011e2ab 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1588,11 +1588,11 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, using CountT = std::uint32_t; // memory for storing count and offset values - CountT *count_ptr = - sycl::malloc_device(n_iters * n_counts, exec_q); - if (nullptr == count_ptr) { - throw std::runtime_error("Could not allocate USM-device memory"); - } + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); constexpr std::uint32_t zero_radix_iter{0}; @@ -1605,25 +1605,17 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, n_counts, count_ptr, proj_op, is_ascending, depends); - sort_ev = exec_q.submit([=](sycl::handler &cgh) { - cgh.depends_on(sort_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, count_ptr]() { sycl_free_noexcept(count_ptr, ctx); }); - }); + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); return sort_ev; } - ValueT *tmp_arr = - sycl::malloc_device(n_iters * n_to_sort, exec_q); - if (nullptr == tmp_arr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(count_ptr, exec_q); - throw std::runtime_error("Could not allocate USM-device memory"); - } + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); // iterations per each bucket assert("Number of iterations must be even" && radix_iters % 2 == 0); @@ -1657,17 +1649,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } } - sort_ev = exec_q.submit([=](sycl::handler &cgh) { - cgh.depends_on(sort_ev); - - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, count_ptr, tmp_arr]() { - sycl_free_noexcept(tmp_arr, ctx); - sycl_free_noexcept(count_ptr, ctx); - }); - }); + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); } return sort_ev; @@ -1769,13 +1752,12 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; const std::size_t total_nelems = iter_nelems * sort_nelems; - const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; - IndexTy *workspace = sycl::malloc_device( - padded_total_nelems + total_nelems, exec_q); + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); - if (nullptr == workspace) { - throw std::runtime_error("Could not allocate workspace on device"); - } + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = @@ -1820,14 +1802,8 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, }); }); - sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(map_back_ev); - - 
const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, workspace] { sycl_free_noexcept(workspace, ctx); }); - }); + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {map_back_ev}, workspace_owner); return cleanup_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 6698aa50de..d1d686a2dd 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -30,9 +30,10 @@ #include #include #include -#include #include +#include + #include "kernels/dpctl_tensor_types.hpp" #include "kernels/sorting/merge_sort.hpp" #include "kernels/sorting/radix_sort.hpp" @@ -90,11 +91,11 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, const CompT &comp, const std::vector &depends) { - IndexTy *index_data = - sycl::malloc_device(iter_nelems * axis_nelems, exec_q); - if (index_data == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); using IotaKernelName = topk_populate_index_data_krn; @@ -153,14 +154,8 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, }); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_out_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); return cleanup_host_task_event; }; @@ -283,11 +278,11 @@ sycl::event topk_merge_impl( index_comp, depends); } - IndexTy *index_data = - sycl::malloc_device(iter_nelems * alloc_len, exec_q); - if (index_data == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); // no need to populate index data: SLM will be populated with default // values @@ -427,14 +422,8 @@ sycl::event topk_merge_impl( }); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_topk_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); return cleanup_host_task_event; } @@ -474,15 +463,13 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, const std::size_t total_nelems = iter_nelems * axis_nelems; const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; - IndexTy *workspace = sycl::malloc_device( - padded_total_nelems + total_nelems, exec_q); + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); - IndexTy *tmp_tp = sycl::malloc_device(total_nelems, exec_q); - - if (nullptr == workspace || nullptr == tmp_tp) { - throw std::runtime_error( - "Not enough device memory for radix sort topk"); - } + // get raw USM pointer + IndexTy *workspace = 
workspace_owner.get();
+    IndexTy *tmp_tp = workspace + padded_total_nelems;
 
     using IdentityProjT = radix_sort_details::IdentityProj;
     using IndexedProjT =
         radix_sort_details::IndexedProj<argTy, IdentityProjT>;
     const IndexedProjT proj_op{arg_tp, IdentityProjT{}};
 
@@ -536,17 +523,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
         });
     });
 
-    sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(write_topk_ev);
-
-        const sycl::context &ctx = exec_q.get_context();
-
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        cgh.host_task([ctx, workspace, tmp_tp] {
-            sycl_free_noexcept(workspace, ctx);
-            sycl_free_noexcept(tmp_tp, ctx);
-        });
-    });
+    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {write_topk_ev}, workspace_owner);
 
     return cleanup_ev;
 }
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
index 3ad5f6f36a..87476221d8 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -28,6 +28,9 @@
 #include <exception>
 #include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <vector>
 
 #include "sycl/sycl.hpp"
 
@@ -73,11 +76,99 @@ void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept
     }
 }
 
-template <typename T> void sycl_free_noexcept(T *ptr, sycl::queue &q) noexcept
+template <typename T>
+void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept
 {
     sycl_free_noexcept(ptr, q.get_context());
 }
 
+class USMDeleter
+{
+private:
+    sycl::context ctx_;
+
+public:
+    USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {}
+    USMDeleter(const sycl::context &ctx) : ctx_(ctx) {}
+
+    template <typename T> void operator()(T *ptr) const
+    {
+        sycl_free_noexcept(ptr, ctx_);
+    }
+};
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc(std::size_t count,
+             const sycl::queue &q,
+             sycl::usm::alloc kind,
+             const sycl::property_list &propList = {})
+{
+    T *ptr = sycl::malloc<T>(count, q, kind, propList);
+    if (nullptr == ptr) {
+        throw std::runtime_error("Unable to allocate device_memory");
+    }
+
+    auto usm_deleter = USMDeleter(q);
+    return std::unique_ptr<T, USMDeleter>(ptr, usm_deleter);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc_device(std::size_t count,
+                    const sycl::queue &q,
+                    const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::device, propList);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc_shared(std::size_t count,
+                    const sycl::queue &q,
+                    const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::shared, propList);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc_jost(std::size_t count,
+                  const sycl::queue &q,
+                  const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
+}
+
+template <typename... Args>
+sycl::event async_smart_free(sycl::queue &exec_q,
+                             const std::vector<sycl::event> &depends,
+                             Args &&...args)
+{
+    constexpr std::size_t n = sizeof...(Args);
+
+    std::vector<void *> ptrs;
+    ptrs.reserve(n);
+    (ptrs.push_back(reinterpret_cast<void *>(args.get())), ...);
+
+    std::vector<USMDeleter> dels;
+    dels.reserve(n);
+    (dels.push_back(args.get_deleter()), ...);
+
+    sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.host_task([ptrs, dels]() {
+            for (size_t i = 0; i < ptrs.size(); ++i) {
+                dels[i](ptrs[i]);
+            }
+        });
+    });
+    (args.release(), ...);
+
+    return ht_e;
+}
+
 } // end of namespace alloc_utils
 } // end of namespace tensor
 } // end of namespace dpctl

From 79feecd5c2785d971a5aeb84cfacd78df7e5d431 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 25 Dec 2024 06:41:40 -0600
Subject: [PATCH 08/15] 
Factor out write-out kernel into separate detail function Replaced three duplicates of the same kernel with calls to this function. --- .../include/kernels/sorting/topk.hpp | 110 +++++++++--------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index d1d686a2dd..3d07b26802 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -71,6 +71,41 @@ void scale_topk_params(const std::uint64_t nelems_per_slm, throw std::runtime_error("Could not construct top k kernel parameters"); } +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + const std::size_t gid = id[0]; + + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); + + const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid; + const std::size_t dst_idx = gid; + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + }); + }); + + return write_out_ev; +} + } // namespace topk_detail template @@ -132,26 +167,13 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, {base_sort_ev}); - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - using KernelName = topk_full_merge_map_back_krn; - - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; - - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + using WriteOutKernelName = + topk_full_merge_map_back_krn; - std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = res_ind % axis_nelems; - }); - }); + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); sycl::event cleanup_host_task_event = dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, @@ -399,27 +421,13 @@ sycl::event topk_merge_impl( k_rounded, {base_sort_ev}); // Write out top k of the merge-sorted memory - sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - using KernelName = - topk_partial_merge_map_back_krn; + using WriteOutKernelName = + topk_partial_merge_map_back_krn; - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; - - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); - - const std::size_t src_idx = iter_gid * alloc_len + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); - }); - }); + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, 
iter_nelems, k, arg_tp, index_data, alloc_len,
+            axis_nelems, vals_tp, inds_tp, {merges_ev});
 
     sycl::event cleanup_host_task_event =
         dpctl::tensor::alloc_utils::async_smart_free(
@@ -502,26 +508,12 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
                                ascending, {iota_ev});
 
     // Write out top k of the temporary
-    sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(radix_sort_ev);
-
-        using KernelName = topk_radix_map_back_krn;
+    using WriteOutKernelName = topk_radix_map_back_krn;
 
-        cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) {
-            const std::size_t gid = id[0];
-
-            const std::size_t iter_gid = gid / k;
-            const std::size_t axis_gid = gid - (iter_gid * k);
-
-            const std::size_t src_idx = iter_gid * axis_nelems + axis_gid;
-            const std::size_t dst_idx = gid;
-
-            const IndexTy res_ind = tmp_tp[src_idx];
-            const argTy v = arg_tp[res_ind];
-            vals_tp[dst_idx] = v;
-            inds_tp[dst_idx] = (res_ind % axis_nelems);
-        });
-    });
+    sycl::event write_topk_ev =
+        topk_detail::write_out_impl(
+            exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems,
+            vals_tp, inds_tp, {radix_sort_ev});
 
     sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
         exec_q, {write_topk_ev}, workspace_owner);

From 83b1771fbef8500f43c1d98f390ce11b3f9e1e56 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 25 Dec 2024 06:43:46 -0600
Subject: [PATCH 09/15] Typo: smart_malloc_jost -> smart_malloc_host

---
 dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
index 87476221d8..72440ef969 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -133,7 +133,7 @@ smart_malloc_shared(std::size_t count,
 
 template <typename T>
 std::unique_ptr<T, USMDeleter>
-smart_malloc_jost(std::size_t count,
+smart_malloc_host(std::size_t count,
                   const sycl::queue &q,
                   const sycl::property_list &propList = {})
 {

From 1c25d0b758c7b2a54e7374b6ec7c84b9cb35055e Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 25 Dec 2024 11:05:22 -0600
Subject: [PATCH 10/15] Reimplement map_back_impl to process a few elements
 per work-item

The factored-out map_back_impl projects a flat (linearized) sort index
to a row-wise index.

Removed dead code excluded by preprocessor conditional.
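For reference, the re-indexing each work-item applies is equivalent to
this scalar sketch (illustrative only, not part of the patch):

```cpp
#include <cstddef>

// A flat sort index into a C-contiguous matrix encodes (row, column) as
// flat = row * row_size + column; argsort output needs the in-row
// position, so the write-out keeps only the remainder.
inline std::size_t map_back(std::size_t flat_index, std::size_t row_size)
{
    return flat_index % row_size;
}
```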
--- .../include/kernels/sorting/merge_sort.hpp | 33 ++--------- .../include/kernels/sorting/radix_sort.hpp | 28 ++-------- .../include/kernels/sorting/sort_utils.hpp | 21 +++++++ .../include/kernels/sorting/topk.hpp | 55 ++++++++++++++----- 4 files changed, 70 insertions(+), 67 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index 002da0ff9b..dbf40c10fe 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -816,24 +816,9 @@ sycl::event stable_argsort_axis1_contig_impl( using IotaKernelName = populate_index_data_krn; -#if 1 sycl::event populate_indexed_data_ev = iota_impl( exec_q, res_tp, total_nelems, depends); -#else - sycl::event populate_indexed_data_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - const sycl::range<1> range{total_nelems}; - - cgh.parallel_for(range, [=](sycl::id<1> id) { - size_t i = id[0]; - res_tp[i] = static_cast(i); - }); - }); -#endif - // Sort segments of the array sycl::event base_sort_ev = merge_sort_detail::sort_over_work_group_contig_impl( @@ -847,21 +832,11 @@ sycl::event stable_argsort_axis1_contig_impl( exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, {base_sort_ev}); - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - auto temp_acc = - merge_sort_detail::GetReadOnlyAccess{}(res_tp, - cgh); + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; - using KernelName = index_map_to_rows_krn; - - const sycl::range<1> range{total_nelems}; - - cgh.parallel_for(range, [=](sycl::id<1> id) { - res_tp[id] = (temp_acc[id] % sort_nelems); - }); - }); + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); return write_out_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index de0011e2ab..811aa581ff 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1766,41 +1766,21 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, using IotaKernelName = radix_argsort_iota_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event iota_ev = iota_impl( exec_q, workspace, total_nelems, depends); -#else - - sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - size_t i = id[0]; - IndexTy sort_id = static_cast(i); - workspace[i] = sort_id; - }); - }); -#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, sort_ascending, {iota_ev}); - sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(radix_sort_ev); - - using KernelName = radix_argsort_index_write_out_krn; + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - IndexTy linear_index = res_tp[id]; - res_tp[id] = (linear_index % sort_nelems); - }); - }); + sycl::event map_back_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, 
sort_nelems, {radix_sort_ev});
 
     sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
         exec_q, {map_back_ev}, workspace_owner);
 
     return cleanup_ev;
 }
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
index 1b1c361d75..f62a6c3fa0 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
@@ -97,6 +97,27 @@ sycl::event iota_impl(sycl::queue &exec_q,
     return e;
 }
 
+template <typename KernelName, typename IndexTy>
+sycl::event map_back_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          const IndexTy *flat_index_data,
+                          IndexTy *reduced_index_data,
+                          std::size_t row_size,
+                          const std::vector<sycl::event> &dependent_events)
+{
+    sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependent_events);
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems), [=](sycl::id<1> id) {
+                const IndexTy linear_index = flat_index_data[id];
+                reduced_index_data[id] = (linear_index % row_size);
+            });
+    });
+
+    return map_back_ev;
+}
+
 } // end of namespace sort_utils_detail
 } // end of namespace kernels
 } // end of namespace tensor
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
index 3d07b26802..6b235a1427 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -83,23 +83,50 @@ sycl::event write_out_impl(sycl::queue &exec_q,
                            IndexTy *inds_tp,
                            const std::vector<sycl::event> &depends)
 {
-    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) {
-            const std::size_t gid = id[0];
+    constexpr std::uint32_t lws = 64;
+    constexpr std::uint32_t n_wi = 4;
+    const std::size_t nelems = iter_nelems * k;
+    const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
 
-            const std::size_t iter_gid = gid / k;
-            const std::size_t axis_gid = gid - (iter_gid * k);
+    sycl::range<1> lRange{lws};
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::nd_range<1> ndRange{gRange, lRange};
 
-            const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid;
-            const std::size_t dst_idx = gid;
-
-            const IndexTy res_ind = index_data[src_idx];
-            const argTy v = arg_tp[res_ind];
+    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
 
-            vals_tp[dst_idx] = v;
-            inds_tp[dst_idx] = (res_ind % axis_nelems);
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
+            const std::size_t gid = it.get_global_linear_id();
+            const auto &sg = it.get_sub_group();
+            const std::uint32_t lane_id = sg.get_local_id()[0];
+            const std::uint32_t sg_size = sg.get_max_local_range()[0];
+
+            const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
+
+#pragma unroll
+            for (std::uint32_t i = 0; i < n_wi; ++i) {
+                const std::size_t data_id = start_id + i * sg_size;
+
+                if (data_id < nelems) {
+                    const std::size_t iter_id = data_id / k;
+
+                    /*
+                    const std::size_t axis_gid = data_id - (iter_gid * k);
+                    const std::size_t src_idx = iter_gid * iter_index_stride +
+                    axis_gid;
+                    */
+                    const std::size_t src_idx =
+                        data_id + iter_id * (iter_index_stride - k);
+
+                    const IndexTy res_ind = index_data[src_idx];
+                    const argTy v = arg_tp[res_ind];
+
+                    const std::size_t dst_idx = data_id;
+                    vals_tp[dst_idx] = v;
+                    inds_tp[dst_idx] = (res_ind % axis_nelems);
+                }
+            }
         });
     });
 

From a74c788dc25d7dc2195b9dee912f66209b95e3c6 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 25 Dec 2024 16:51:13 -0600
Subject: [PATCH 11/15] Replace use of std::log2(size_t_value)

Replaced it with a hand-written implementation of ceil_log2(n), such
that n <= (decltype(n){1} << ceil_log2(n)) is true for all positive
values of `n` in the range.
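A quick check of the stated property (illustrative sketch only; assumes
the ceil_log2 defined in radix_sort.hpp is visible):

```cpp
#include <cassert>
#include <cstdint>

// Verify on a small range that 2^ceil_log2(n) is large enough for n,
// and (for n >= 2) that the next smaller power of two is not.
void check_ceil_log2()
{
    for (std::uint64_t n = 1; n <= (std::uint64_t{1} << 20); ++n) {
        const std::uint32_t e = ceil_log2(n);
        assert(n <= (std::uint64_t{1} << e));
        assert(n < 2 || (std::uint64_t{1} << (e - 1)) < n);
    }
}
```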
---
 .../include/kernels/sorting/radix_sort.hpp     | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
index 811aa581ff..887841acb8 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -63,6 +63,45 @@ class radix_sort_reorder_peer_kernel;
 template class radix_sort_reorder_kernel;
 
+/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */
+template <typename SizeT,
+          std::enable_if_t<std::is_unsigned_v<SizeT> &&
+                               sizeof(SizeT) == sizeof(std::uint64_t),
+                           int> = 0>
+std::uint32_t ceil_log2(SizeT n)
+{
+    if (n <= 1)
+        return std::uint32_t{1};
+
+    std::uint32_t exp{1};
+    --n;
+    if (n >= (SizeT{1} << 32)) {
+        n >>= 32;
+        exp += 32;
+    }
+    if (n >= (SizeT{1} << 16)) {
+        n >>= 16;
+        exp += 16;
+    }
+    if (n >= (SizeT{1} << 8)) {
+        n >>= 8;
+        exp += 8;
+    }
+    if (n >= (SizeT{1} << 4)) {
+        n >>= 4;
+        exp += 4;
+    }
+    if (n >= (SizeT{1} << 2)) {
+        n >>= 2;
+        exp += 2;
+    }
+    if (n >= (SizeT{1} << 1)) {
+        n >>= 1;
+        ++exp;
+    }
+    return exp;
+}
+
 //----------------------------------------------------------
 // bitwise order-preserving conversions to unsigned integers
 //----------------------------------------------------------
@@ -1145,7 +1184,7 @@ struct subgroup_radix_sort
         const std::size_t max_slm_size =
             dev.template get_info<sycl::info::device::local_mem_size>() / 2;
 
-        const auto n_uniform = 1 << (std::uint32_t(std::log2(n - 1)) + 1);
+        const auto n_uniform = 1 << ceil_log2(n);
 
         const auto req_slm_size_val = sizeof(T) * n_uniform;
 
         return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size)

From 9294095edb137885dad076bb535c127bb0eab511 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 26 Dec 2024 21:03:59 -0600
Subject: [PATCH 12/15] Remove dead code disabled via preprocessor conditional

---
 .../include/kernels/sorting/topk.hpp          | 27 ------------------
 1 file changed, 27 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
index 6b235a1427..0ff3fc4723 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -161,24 +161,10 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
 
     using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy>;
 
-#if 1
     using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
 
     sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
         exec_q, index_data, iter_nelems * axis_nelems, depends);
-#else
-    sycl::event populate_indexed_data_ev =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            auto const &range = sycl::range<1>(iter_nelems * axis_nelems);
-
-            cgh.parallel_for<IotaKernelName>(range, [=](sycl::id<1> id) {
-                std::size_t i = id[0];
-                index_data[i] = static_cast<IndexTy>(i);
-            });
-        });
-#endif
 
     std::size_t sorted_block_size;
     // Sort segments of the array
@@ -513,23 +499,10 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
 
     using IotaKernelName = topk_iota_krn<argTy, IndexTy>;
 
-#if 1
     using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
 
     sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
        exec_q, workspace, 
total_nelems, depends); -#else - sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - size_t i = id[0]; - IndexTy sort_id = static_cast(i); - workspace[i] = sort_id; - }); - }); -#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( From 1bb83bc33ad73b55605536d4d4f7f1e185575bf0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 13:20:00 -0600 Subject: [PATCH 13/15] Add information displayed on failure, renamed variables Add check of computed against expected indices --- dpctl/tests/test_usm_ndarray_top_k.py | 157 +++++++++++++++++++++----- 1 file changed, 131 insertions(+), 26 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index 4e05615c80..a029db005c 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -20,6 +20,38 @@ from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported +def _expected_largest_inds(inp, n, shift, k): + "Computed expected top_k indices for mode='largest'" + assert k < n + ones_start_id = shift % (2 * n) + + alloc_dev = inp.device + + if ones_start_id < n: + expected_inds = dpt.arange( + ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev + ) + else: + # wrap-around + ones_end_id = (ones_start_id + n) % (2 * n) + if ones_end_id >= k: + expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev) + else: + expected_inds = dpt.concat( + ( + dpt.arange(ones_end_id, dtype="i8", device=alloc_dev), + dpt.arange( + ones_start_id, + ones_start_id + k - ones_end_id, + dtype="i8", + device=alloc_dev, + ), + ) + ) + + return expected_inds + + @pytest.mark.parametrize( "dtype", [ @@ -38,23 +70,57 @@ "c16", ], ) -@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) -def test_topk_1d_largest(dtype, n): +@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193]) +def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) z = dpt.zeros(n, dtype=dtype) - zo = dpt.concat((o, z)) - inp = dpt.roll(zo, 734) - k = 5 + oz = dpt.concat((o, z)) + inp = dpt.roll(oz, shift) + + expected_inds = _expected_largest_inds(oz, n, shift, k) s = dpt.top_k(inp, k, mode="largest") assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.values == dpt.ones(k, dtype=dtype)) - assert dpt.all(s.values == inp[s.indices]) + assert dpt.all(s.indices == expected_inds) + assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values + assert dpt.all(s.values == inp[s.indices]), s.indices + + +def _expected_smallest_inds(inp, n, shift, k): + "Computed expected top_k indices for mode='smallest'" + assert k < n + zeros_start_id = (n + shift) % (2 * n) + zeros_end_id = (shift) % (2 * n) + + alloc_dev = inp.device + + if zeros_start_id < zeros_end_id: + expected_inds = dpt.arange( + zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev + ) + else: + if zeros_end_id >= k: + expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev) + else: + expected_inds = dpt.concat( + ( + dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev), + dpt.arange( + zeros_start_id, + zeros_start_id + k - zeros_end_id, + dtype="i8", + device=alloc_dev, + ), + ) + ) + + return expected_inds @pytest.mark.parametrize( @@ -75,41 +141,80 @@ def 
test_topk_1d_largest(dtype, n):
+def test_top_k_1d_largest(dtype, n):
     q = get_queue_or_skip()
     skip_if_dtype_not_supported(dtype, q)
 
+    shift, k = 734, 5
     o = dpt.ones(n, dtype=dtype)
     z = dpt.zeros(n, dtype=dtype)
-    zo = dpt.concat((o, z))
-    inp = dpt.roll(zo, 734)
-    k = 5
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_largest_inds(oz, n, shift, k)
 
     s = dpt.top_k(inp, k, mode="largest")
     assert s.values.shape == (k,)
     assert s.values.dtype == inp.dtype
     assert s.indices.shape == (k,)
-    assert dpt.all(s.values == dpt.ones(k, dtype=dtype))
-    assert dpt.all(s.values == inp[s.indices])
+    assert dpt.all(s.indices == expected_inds)
+    assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+
+
+def _expected_smallest_inds(inp, n, shift, k):
+    "Computed expected top_k indices for mode='smallest'"
+    assert k < n
+    zeros_start_id = (n + shift) % (2 * n)
+    zeros_end_id = (shift) % (2 * n)
+
+    alloc_dev = inp.device
+
+    if zeros_start_id < zeros_end_id:
+        expected_inds = dpt.arange(
+            zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        if zeros_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        zeros_start_id,
+                        zeros_start_id + k - zeros_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
 
 
 @pytest.mark.parametrize(
@@ -75,41 +141,80 @@ def test_topk_1d_largest(dtype, n):
         "c16",
     ],
 )
-@pytest.mark.parametrize("n", [33, 255, 257, 513, 1021, 8193])
-def test_topk_1d_smallest(dtype, n):
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_1d_smallest(dtype, n):
     q = get_queue_or_skip()
     skip_if_dtype_not_supported(dtype, q)
 
+    shift, k = 734, 5
     o = dpt.ones(n, dtype=dtype)
     z = dpt.zeros(n, dtype=dtype)
-    zo = dpt.concat((o, z))
-    inp = dpt.roll(zo, 734)
-    k = 5
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_smallest_inds(oz, n, shift, k)
 
     s = dpt.top_k(inp, k, mode="smallest")
     assert s.values.shape == (k,)
     assert s.values.dtype == inp.dtype
     assert s.indices.shape == (k,)
-    assert dpt.all(s.values == dpt.zeros(k, dtype=dtype))
-    assert dpt.all(s.values == inp[s.indices])
+    assert dpt.all(s.indices == expected_inds)
+    assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
 
 
 # triage failing top k radix implementation on CPU
 # replicates from Python behavior of radix sort topk implementation
-@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193])
-def test_topk_largest_1d_radix_i1_255(n):
+@pytest.mark.parametrize(
+    "n",
+    [
+        33,
+        34,
+        35,
+        36,
+        37,
+        38,
+        39,
+        40,
+        41,
+        42,
+        43,
+        44,
+        45,
+        46,
+        47,
+        48,
+        49,
+        50,
+        61,
+        137,
+        255,
+        511,
+        1021,
+        8193,
+    ],
+)
+def test_top_k_largest_1d_radix_i1(n):
     get_queue_or_skip()
     dt = "i1"
 
+    shift, k = 734, 5
     o = dpt.ones(n, dtype=dt)
     z = dpt.zeros(n, dtype=dt)
-    zo = dpt.concat((o, z))
-    inp = dpt.roll(zo, 734)
-    k = 5
-
-    sorted = dpt.copy(dpt.sort(inp, descending=True, kind="radixsort")[:k])
-    argsorted = dpt.copy(
-        dpt.argsort(inp, descending=True, kind="radixsort")[:k]
-    )
-    assert dpt.all(sorted == dpt.ones(k, dtype=dt))
-    assert dpt.all(sorted == inp[argsorted])
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_largest_inds(oz, n, shift, k)
+
+    sorted_v = dpt.sort(inp, descending=True, kind="radixsort")
+    argsorted = dpt.argsort(inp, descending=True, kind="radixsort")
+
+    assert dpt.all(sorted_v == inp[argsorted])
+
+    topk_vals = dpt.copy(sorted_v[:k])
+    topk_inds = dpt.copy(argsorted[:k])
+
+    assert dpt.all(topk_vals == dpt.ones(k, dtype=dt))
+    assert dpt.all(topk_inds == expected_inds)
+
+    assert dpt.all(topk_vals == inp[topk_inds]), topk_inds

From fde74083158d9ef50d5fa65e2e60ec65b6c0c0db Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 27 Dec 2024 08:56:21 -0600
Subject: [PATCH 14/15] Add comment to explain ceil_log2 algo

---
 dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
index 887841acb8..335b285bbf 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -75,6 +75,8 @@ std::uint32_t ceil_log2(SizeT n)
 
     std::uint32_t exp{1};
     --n;
+    // if n >= 2^b, write n = q * 2^b + r with q > 0 and 0 <= r < 2^b;
+    // then floor_log2(n) == b + floor_log2(q): shift n right by b, add b to exp
     if (n >= (SizeT{1} << 32)) {
         n >>= 32;
         exp += 32;

From 4c8d7d768b4deb3a9a7702acc6fc8fab447cdfa4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 27 Dec 2024 08:57:01 -0600
Subject: [PATCH 15/15] Add static asserts to async_smart_free

One asserts that at least one unique pointer is specified. Another
asserts that the specified arguments are unique pointers created with
USMDeleter.
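Illustration of what the new asserts enforce (hypothetical caller, not
part of the patch):

```cpp
// compiles: arguments are unique_ptr's obtained from smart_malloc_*
auto tmp1 = smart_malloc_device<int>(n, q);
auto tmp2 = smart_malloc_device<float>(n, q);
sycl::event ht_ev = async_smart_free(q, deps, tmp1, tmp2);

// rejected at compile time:
// async_smart_free(q, deps);       // no smart pointer arguments
// int *raw = sycl::malloc_device<int>(n, q);
// async_smart_free(q, deps, raw);  // not a unique_ptr with USMDeleter
```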
---
 .../include/utils/sycl_alloc_utils.hpp | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
index 72440ef969..f67e1bba1f 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -30,6 +30,7 @@
 #include <iostream>
 #include <memory>
 #include <stdexcept>
+#include <type_traits>
 #include <vector>
 
 #include "sycl/sycl.hpp"
@@ -140,12 +141,50 @@ smart_malloc_host(std::size_t count,
     return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
 }
 
+namespace
+{
+template <typename T> struct valid_smart_ptr : public std::false_type
+{
+};
+
+template <typename T, typename D>
+struct valid_smart_ptr<std::unique_ptr<T, D> &>
+    : public std::is_same<D, USMDeleter>
+{
+};
+
+template <typename T, typename D>
+struct valid_smart_ptr<std::unique_ptr<T, D>>
+    : public std::is_same<D, USMDeleter>
+{
+};
+
+// base case
+template <typename... Rest> struct all_valid_smart_ptrs
+{
+    static constexpr bool value = true;
+};
+
+template <typename Arg, typename... RestArgs>
+struct all_valid_smart_ptrs<Arg, RestArgs...>
+{
+    static constexpr bool value = valid_smart_ptr<Arg>::value &&
+                                  (all_valid_smart_ptrs<RestArgs...>::value);
+};
+} // namespace
+
 template <typename... Args>
 sycl::event async_smart_free(sycl::queue &exec_q,
                              const std::vector<sycl::event> &depends,
                              Args &&...args)
 {
     constexpr std::size_t n = sizeof...(Args);
+    static_assert(
+        n > 0, "async_smart_free requires at least one smart pointer argument");
+
+    static_assert(
+        all_valid_smart_ptrs<Args...>::value,
+        "async_smart_free requires unique_ptr created with smart_malloc");
 
     std::vector<void *> ptrs;
     ptrs.reserve(n);