From b8482317e26c3ae0eb421eb4b5b95fe0d16a0743 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 31 Jul 2023 20:28:08 +0000 Subject: [PATCH 001/432] Introduce constructor for multi-GPU support. --- core/src/Cuda/Kokkos_Cuda.hpp | 2 ++ core/src/Cuda/Kokkos_Cuda_Instance.cpp | 45 +++++++++++++++----------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 4 +-- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 6c78a7984d0..b805a4464e1 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -183,6 +183,8 @@ class Cuda { Cuda(cudaStream_t stream, bool manage_stream = false); + Cuda(int device_id, cudaStream_t stream); + //-------------------------------------------------------------------------- //! Free any resources being consumed by the device. static void impl_finalize(); diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 0717bda55a3..79882e70346 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -101,17 +101,16 @@ int cuda_kernel_arch() { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper( - reinterpret_cast(&d_arch), sizeof(int)))); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - d_arch, &arch, sizeof(int), cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault)); query_cuda_kernel_arch<<<1, 1>>>(d_arch); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - &arch, d_arch, sizeof(int), cudaMemcpyDefault))); KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_free_wrapper(d_arch))); + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(d_arch)); return arch; 
} @@ -370,7 +369,8 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(int cuda_device, cudaStream_t stream, + bool manage_stream) { if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; @@ -387,6 +387,8 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags; + m_cudaDev = cuda_device; + if (ok_init) { //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -736,18 +738,16 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { const int cuda_device_id = Impl::get_gpu(settings); const auto &dev_info = Impl::CudaInternalDevices::singleton(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + // Need device capability 3.0 or better const bool ok_dev = 3 <= dev_info.m_cudaProp[cuda_device_id].major && 0 <= dev_info.m_cudaProp[cuda_device_id].minor; if (ok_dev) { const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; - Impl::CudaInternal::m_cudaDev = cuda_device_id; Impl::CudaInternal::m_deviceProp = cudaProp; - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); - // Query what compute capability architecture a kernel executes: Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); @@ -842,12 +842,10 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { } cudaStream_t singleton_stream; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_create_wrapper( - &singleton_stream))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); - auto &cuda_singleton = Impl::CudaInternal::singleton(); - cuda_singleton.initialize(singleton_stream, /*manage*/ true); + 
Impl::CudaInternal::singleton().initialize(cuda_device_id, singleton_stream, + /*manage*/ true); } std::vector Cuda::detect_device_arch() { @@ -893,7 +891,18 @@ Cuda::Cuda(cudaStream_t stream, bool manage_stream) }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, manage_stream); + m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, + stream, manage_stream); +} + +Cuda::Cuda(int device_id, cudaStream_t stream) + : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { + ptr->finalize(); + delete ptr; + }) { + Impl::CudaInternal::singleton().verify_is_initialized( + "Cuda instance constructor"); + m_space_instance->initialize(device_id, stream, /*manage_stream*/ false); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 61002e9df5e..173ed3233d0 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -102,7 +102,7 @@ class CudaInternal { public: using size_type = Cuda::size_type; - inline static int m_cudaDev = -1; + int m_cudaDev = -1; // Device Properties inline static int m_cudaArch = -1; @@ -159,7 +159,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(int cuda_devie, cudaStream_t stream, bool manage_stream); void finalize(); void print_configuration(std::ostream&) const; From 96b157b6c4bb6c7305a4226ef910109bbea336c9 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 21 Aug 2023 14:42:04 -0400 Subject: [PATCH 002/432] Add support for HIP Graph --- core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 166 ++++++++++++++++ core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp | 56 ++++++ core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 190 +++++++++++++++++++ 
core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 141 +++++++++++--- core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp | 3 +- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 1 + core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 6 +- core/src/Kokkos_Graph.hpp | 1 + 8 files changed, 539 insertions(+), 25 deletions(-) create mode 100644 core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp create mode 100644 core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp create mode 100644 core/src/HIP/Kokkos_HIP_Graph_Impl.hpp diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp new file mode 100644 index 00000000000..76984a37729 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -0,0 +1,166 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_GRAPHNODEKERNEL_HPP +#define KOKKOS_HIP_GRAPHNODEKERNEL_HPP + +#include + +#include +#include + +#include +#include +#include + +#include + +namespace Kokkos { +namespace Impl { + +template +struct PatternImplSpecializationFromTag< + Kokkos::ParallelReduceTag, CombinedFunctorReducer, PolicyType, Kokkos::HIP> + : type_identity< + ParallelReduce> {}; + +template +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { + public: + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + using base_t = + typename PatternImplSpecializationFromTag::type; + using Record = Kokkos::Impl::SharedAllocationRecord; + + // TODO use the name and executionspace + template + GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...) 
{} + + template + GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) + : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), + (PolicyDeduced &&) arg_policy) {} + + ~GraphNodeKernelImpl() { + if (m_driver_storage) { + Record::decrement(Record::get_record(m_driver_storage)); + } + } + + void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { + m_graph_ptr = arg_graph_ptr; + } + + void set_hip_graph_node_ptr(hipGraphNode_t* arg_node_ptr) { + m_graph_node_ptr = arg_node_ptr; + } + + hipGraphNode_t* get_hip_graph_node_ptr() const { return m_graph_node_ptr; } + + hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } + + Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + KOKKOS_EXPECTS(m_driver_storage == nullptr); + + auto* record = Record::allocate( + Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", + sizeof(base_t)); + + Record::increment(record); + m_driver_storage = reinterpret_cast(record->data()); + KOKKOS_ENSURES(m_driver_storage != nullptr); + + return m_driver_storage; + } + + private: + Kokkos::ObservingRawPtr m_graph_ptr = nullptr; + Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; + Kokkos::OwningRawPtr m_driver_storage = nullptr; +}; + +struct HIPGraphNodeAggregateKernel { + using graph_kernel = HIPGraphNodeAggregateKernel; + + // Aggregates don't need a policy, but for the purposes of checking the static + // assertions about graph kerenls, + struct Policy { + using is_graph_kernel = std::true_type; + }; +}; + +template ::type> +struct get_graph_node_kernel_type + : type_identity< + GraphNodeKernelImpl> {}; + +template +struct get_graph_node_kernel_type + : type_identity, + Kokkos::ParallelReduceTag>> {}; + +template +auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + + return 
kernel_as_graph_kernel.allocate_driver_memory_buffer(); +} + +template +auto const& get_hip_graph_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + hipGraph_t const* graph_ptr = kernel_as_graph_kernel.get_hip_graph_ptr(); + KOKKOS_EXPECTS(graph_ptr != nullptr); + + return *graph_ptr; +} + +template +auto& get_hip_graph_node_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type::type; + auto const& kernel_as_graph_kernel = + static_cast(kernel); + auto* graph_node_ptr = kernel_as_graph_kernel.get_hip_graph_node_ptr(); + KOKKOS_EXPECTS(graph_node_ptr != nullptr); + + return *graph_node_ptr; +} +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp b/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp new file mode 100644 index 00000000000..4f972b02be3 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_GRAPHNODE_IMPL_HPP +#define KOKKOS_HIP_GRAPHNODE_IMPL_HPP + +#include + +#include + +#include + +namespace Kokkos { +namespace Impl { +template <> +struct GraphNodeBackendSpecificDetails { + hipGraphNode_t node = nullptr; + + explicit GraphNodeBackendSpecificDetails() = default; + + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept {} +}; + +template +struct GraphNodeBackendDetailsBeforeTypeErasure { + protected: + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::HIP const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} + + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::HIP const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} +}; + +} // namespace Impl +} // namespace Kokkos + +#include + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp new file mode 100644 index 00000000000..ca887c8ca96 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -0,0 +1,190 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_GRAPH_IMPL_HPP +#define KOKKOS_HIP_GRAPH_IMPL_HPP + +#include + +#if defined(KOKKOS_ENABLE_HIP) + +#include + +#include + +#include +#include + +namespace Kokkos { +namespace Impl { +template <> +class GraphImpl { + public: + using node_details_t = GraphNodeBackendSpecificDetails; + using root_node_impl_t = + GraphNodeImpl; + using aggregate_kernel_impl_t = HIPGraphNodeAggregateKernel; + using aggregate_node_impl_t = + GraphNodeImpl; + + // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Graph object. + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + + ~GraphImpl(); + + explicit GraphImpl(Kokkos::HIP instance); + + void add_node(std::shared_ptr const& arg_node_ptr); + + template + void add_node(std::shared_ptr const& arg_node_ptr); + + template + void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); + + void submit(); + + Kokkos::HIP const& get_execution_space() const noexcept; + + auto create_root_node_ptr(); + + template + auto create_aggregate_ptr(PredecessorRefs&&...); + + private: + void instantiate_graph() { + constexpr size_t error_log_size = 256; + hipGraphNode_t error_node = nullptr; + char error_log[error_log_size]; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( + &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + } + + Kokkos::HIP m_execution_space; + hipGraph_t m_graph = nullptr; + hipGraphExec_t m_graph_exec = nullptr; +}; + +GraphImpl::~GraphImpl() { + m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); + KOKKOS_EXPECTS(m_graph); + if (m_graph_exec) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphExecDestroy(m_graph_exec)); + } + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph)); +} + +GraphImpl::GraphImpl(Kokkos::HIP 
instance) + : m_execution_space(std::move(instance)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0)); +} + +void GraphImpl::add_node( + std::shared_ptr const& arg_node_ptr) { + // All of the predecessors are just added as normal, so all we need to + // do here is add an empty node + KOKKOS_IMPL_HIP_SAFE_CALL( + hipGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); +} + +// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl +// Also requires that the kernel has the graph node tag in it's policy +template +void GraphImpl::add_node( + std::shared_ptr const& arg_node_ptr) { + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); + KOKKOS_EXPECTS(arg_node_ptr); + // The Kernel launch from the execute() method has been shimmed to insert + // the node into the graph + auto& kernel = arg_node_ptr->get_kernel(); + auto& node = static_cast(arg_node_ptr.get())->node; + KOKKOS_EXPECTS(!node); + kernel.set_hip_graph_ptr(&m_graph); + kernel.set_hip_graph_node_ptr(&node); + kernel.execute(); + KOKKOS_ENSURES(node); +} + +// Requires PredecessorRef is a specialization of GraphNodeRef that has +// already been added to this graph and NodeImpl is a specialization of +// GraphNodeImpl that has already been added to this graph. 
+template +void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, + PredecessorRef arg_pred_ref) { + KOKKOS_EXPECTS(arg_node_ptr); + auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); + KOKKOS_EXPECTS(pred_ptr); + + auto const& pred_node = pred_ptr->node_details_t::node; + KOKKOS_EXPECTS(pred_node); + + auto const& node = arg_node_ptr->node_details_t::node; + KOKKOS_EXPECTS(node); + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); +} + +void GraphImpl::submit() { + if (!m_graph_exec) { + instantiate_graph(); + } + KOKKOS_IMPL_HIP_SAFE_CALL( + hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); +} + +Kokkos::HIP const& GraphImpl::get_execution_space() const + noexcept { + return m_execution_space; +} + +auto GraphImpl::create_root_node_ptr() { + KOKKOS_EXPECTS(m_graph); + KOKKOS_EXPECTS(!m_graph_exec); + auto rv = std::make_shared(get_execution_space(), + _graph_node_is_root_ctor_tag{}); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddEmptyNode(&(rv->node_details_t::node), + m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + KOKKOS_ENSURES(rv->node_details_t::node); + return rv; +} + +template +auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. 
+ return std::make_shared(m_execution_space, + _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); +} +} // namespace Impl +} // namespace Kokkos + +#endif +#endif diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 0bc3529530d..7c4e584eec0 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -24,6 +24,8 @@ #include #include #include +#include +#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -193,6 +195,13 @@ struct HIPParallelLaunchKernelFuncData { } }; +//---------------------------------------------------------------// +// Helper function // +//---------------------------------------------------------------// +inline bool is_empty_launch(dim3 const &grid, dim3 const &block) { + return (grid.x == 0) || ((block.x * block.y * block.z) == 0); +} + //---------------------------------------------------------------// // HIPParallelLaunchKernelFunc structure and its specializations // //---------------------------------------------------------------// @@ -367,6 +376,40 @@ struct HIPParallelLaunchKernelInvokerm_stream>>>( driver); } + + static void create_parallel_launch_graph_node( + DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, + HIPInternal const * /*hip_instance*/) { + auto const &graph = Impl::get_hip_graph_from_kernel(driver); + KOKKOS_EXPECTS(graph); + auto &graph_node = Impl::get_hip_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!graph_node); + + if (!Impl::is_empty_launch(grid, block)) { + void const *args[] = {&driver}; + + hipKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void *)base_t::get_kernel_func(); + params.kernelParams = (void **)args; + params.extra = nullptr; + + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ 
nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + KOKKOS_IMPL_HIP_SAFE_CALL( + hipGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + KOKKOS_ENSURES(graph_node); + } }; // HIPLaunchMechanism::GlobalMemory specialization @@ -389,6 +432,50 @@ struct HIPParallelLaunchKernelInvokerm_stream>>>( driver_ptr); } + + static void create_parallel_launch_graph_node( + DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + auto const &graph = Impl::get_hip_graph_from_kernel(driver); + KOKKOS_EXPECTS(graph); + auto &graph_node = Impl::get_hip_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!graph_node); + + if (!Impl::is_empty_launch(grid, block)) { + auto *driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + + // Unlike in the non-graph case, we can get away with doing an async copy + // here because the `DriverType` instance is held in the GraphNodeImpl + // which is guaranteed to be alive until the graph instance itself is + // destroyed, where there should be a fence ensuring that the allocation + // associated with this kernel on the device side isn't deleted. 
+ hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, + hip_instance->m_stream); + + void const *args[] = {&driver_ptr}; + + hipKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void *)base_t::get_kernel_func(); + params.kernelParams = (void **)args; + params.extra = nullptr; + + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + KOKKOS_IMPL_HIP_SAFE_CALL( + hipGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + KOKKOS_ENSURES(bool(graph_node)) + } }; // HIPLaunchMechanism::ConstantMemory specializations @@ -481,38 +568,48 @@ struct HIPParallelLaunch< // al. template , HIPLaunchMechanism LaunchMechanism = - DeduceHIPLaunchMechanism::launch_mechanism> + DeduceHIPLaunchMechanism::launch_mechanism, + bool DoGraph = DriverType::Policy::is_graph_kernel::value> void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { + if constexpr (DoGraph) { + // Graph launch + using base_t = HIPParallelLaunchKernelInvoker; + base_t::create_parallel_launch_graph_node(driver, grid, block, shmem, + hip_instance); + } else { + // Regular kernel launch #ifndef KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS - HIPParallelLaunch( - driver, grid, block, shmem, hip_instance, prefer_shmem); -#else - if constexpr (!HIPParallelLaunch::default_launchbounds()) { - // for user defined, we *always* honor the request HIPParallelLaunch( driver, grid, block, shmem, hip_instance, prefer_shmem); - } else { - // we can do what we like - const unsigned flat_block_size = block.x * block.y * block.z; - if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) { - // we have to use 
the large blocksize - HIPParallelLaunch< - DriverType, - Kokkos::LaunchBounds, - LaunchMechanism>(driver, grid, block, shmem, hip_instance, - prefer_shmem); +#else + if constexpr (!HIPParallelLaunch::default_launchbounds()) { + // for user defined, we *always* honor the request + HIPParallelLaunch( + driver, grid, block, shmem, hip_instance, prefer_shmem); } else { - HIPParallelLaunch, - LaunchMechanism>(driver, grid, block, shmem, - hip_instance, prefer_shmem); + // we can do what we like + const unsigned flat_block_size = block.x * block.y * block.z; + if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) { + // we have to use the large blocksize + HIPParallelLaunch< + DriverType, + Kokkos::LaunchBounds, + LaunchMechanism>(driver, grid, block, shmem, hip_instance, + prefer_shmem); + } else { + HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism>(driver, grid, block, shmem, hip_instance, + prefer_shmem); + } } - } #endif + } } } // namespace Impl } // namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 1478fbee609..0fa325cb12c 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -30,7 +30,8 @@ namespace Impl { template class ParallelFor, HIP> { public: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; private: using array_index_type = typename Policy::array_index_type; diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index f1166ba5ee8..df27e8615b6 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -121,6 +121,7 @@ class ParallelReduce, using value_type = typename ReducerType::value_type; using reference_type = typename ReducerType::reference_type; using functor_type = FunctorType; + using reducer_type = ReducerType; using 
size_type = Kokkos::HIP::size_type; using index_type = typename Policy::index_type; // Conditionally set word_size_type to int16_t or int8_t if value_type is diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index d67371527ff..a8420883040 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -425,7 +425,7 @@ __device__ inline void hip_release_scratch_index(int32_t* scratch_locks, template class ParallelFor, HIP> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using functor_type = FunctorType; using size_type = HIP::size_type; @@ -587,7 +587,8 @@ class ParallelReduce::value; if (!is_empty_range || need_device_set) { const int block_count = diff --git a/core/src/Kokkos_Graph.hpp b/core/src/Kokkos_Graph.hpp index 7f77c00b2d7..b4dae691642 100644 --- a/core/src/Kokkos_Graph.hpp +++ b/core/src/Kokkos_Graph.hpp @@ -161,6 +161,7 @@ Graph create_graph(Closure&& arg_closure) { #include #include #include +#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH From 277e4a6149bf70b4b0703819e8dd217eb8efd564 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 23 Aug 2023 10:57:14 -0400 Subject: [PATCH 003/432] Do not use HIP Graph with ROCm 5.2 --- core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 2 ++ core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index ca887c8ca96..b3a77e86ee3 100644 --- a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -20,6 +20,7 @@ #include #if defined(KOKKOS_ENABLE_HIP) +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR > 2) #include @@ -188,3 +189,4 @@ auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) 
{ #endif #endif +#endif diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7c4e584eec0..ce8fcafd753 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -24,8 +24,11 @@ #include #include #include + +#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR > 2) #include #include +#endif // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -377,6 +380,7 @@ struct HIPParallelLaunchKernelInvoker 2) static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const * /*hip_instance*/) { @@ -410,6 +414,7 @@ struct HIPParallelLaunchKernelInvoker 2) static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { @@ -476,6 +482,7 @@ struct HIPParallelLaunchKernelInvoker 2) if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker; base_t::create_parallel_launch_graph_node(driver, grid, block, shmem, hip_instance); - } else { + } else +#endif + { // Regular kernel launch #ifndef KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS HIPParallelLaunch( From 64a9b3d854521b7b6dabf7fd9e162c959045b2e3 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 25 Aug 2023 16:39:11 -0400 Subject: [PATCH 004/432] Fix typo Co-authored-by: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> --- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 173ed3233d0..d92f9c50e45 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -159,7 +159,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(int cuda_devie, cudaStream_t stream, bool manage_stream); + void 
initialize(int cuda_device, cudaStream_t stream, bool manage_stream); void finalize(); void print_configuration(std::ostream&) const; From 9fde326a373bbb5dde036ce67fe8fb65a56a431d Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 28 Aug 2023 10:05:49 -0400 Subject: [PATCH 005/432] Fix reviewer's comments --- core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 8 +------- core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 5 ----- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 8 ++++---- core/src/Kokkos_Graph.hpp | 4 ++++ 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 76984a37729..e83c041f764 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -31,12 +31,6 @@ namespace Kokkos { namespace Impl { -template -struct PatternImplSpecializationFromTag< - Kokkos::ParallelReduceTag, CombinedFunctorReducer, PolicyType, Kokkos::HIP> - : type_identity< - ParallelReduce> {}; - template class GraphNodeKernelImpl @@ -105,7 +99,7 @@ struct HIPGraphNodeAggregateKernel { using graph_kernel = HIPGraphNodeAggregateKernel; // Aggregates don't need a policy, but for the purposes of checking the static - // assertions about graph kerenls, + // assertions about graph kernels, struct Policy { using is_graph_kernel = std::true_type; }; diff --git a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index b3a77e86ee3..34b69d061a9 100644 --- a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -19,9 +19,6 @@ #include -#if defined(KOKKOS_ENABLE_HIP) -#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR > 2) - #include #include @@ -188,5 +185,3 @@ auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) 
{ } // namespace Kokkos #endif -#endif -#endif diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index ce8fcafd753..4b237625fbd 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,7 +25,7 @@ #include #include -#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR > 2) +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) #include #include #endif @@ -380,7 +380,7 @@ struct HIPParallelLaunchKernelInvoker 2) +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const * /*hip_instance*/) { @@ -438,7 +438,7 @@ struct HIPParallelLaunchKernelInvoker 2) +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { @@ -581,7 +581,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#if (HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR > 2) +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker create_graph(Closure&& arg_closure) { #include #include #include +#if defined(KOKKOS_ENABLE_HIP) +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) #include +#endif +#endif #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH From e627b2e59f61af464d4eb88e251de04d1f3b386e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 28 Aug 2023 17:52:37 +0000 Subject: [PATCH 006/432] Add test --- core/unit_test/CMakeLists.txt | 6 ++++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 34 
+++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 41f2367b07f..c1bd958925e 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -763,6 +763,12 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_CudaInterOpStreamsMultiGPU + SOURCES + UnitTestMain.cpp + cuda/TestCuda_InterOp_StreamsMultiGPU.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..cc00d27b61f --- /dev/null +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,34 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include + +namespace Test { +// Test Interoperability with Cuda Streams and muliple GPUs. 
+TEST(cuda, raw_cuda_streams) { + Kokkos::ScopeGuard scope_guard; + + cudaStream_t stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); + { + TEST_EXECSPACE cuda_instance(TEST_EXECSPACE().cuda_device(), stream); + ASSERT_EQ(cuda_instance.cuda_device(), TEST_EXECSPACE().cuda_device()); + } + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream)); +} +} // namespace Test From 1683786110806f22fb14b2c9e065ee8350cec8ac Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 28 Aug 2023 19:47:52 -0400 Subject: [PATCH 007/432] Fix typo. Co-authored-by: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> --- core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp index cc00d27b61f..a3c71315700 100644 --- a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -19,7 +19,7 @@ #include namespace Test { -// Test Interoperability with Cuda Streams and muliple GPUs. +// Test Interoperability with Cuda Streams and multiple GPUs. 
TEST(cuda, raw_cuda_streams) { Kokkos::ScopeGuard scope_guard; From fa1aaa712a36394c5c1f1508771cbf13160353b7 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 29 Aug 2023 16:29:04 +0000 Subject: [PATCH 008/432] Explicitly check for valid device id --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 79882e70346..20db9deabd9 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -902,6 +902,14 @@ Cuda::Cuda(int device_id, cudaStream_t stream) }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); + const int n_devices = Kokkos::Cuda::detect_device_count(); + if (device_id < 0 || device_id >= n_devices) { + std::stringstream ss; + ss << "Error: Requested GPU with invalid id '" << device_id << "'." + << " The device id must be in the interval [0, " << n_devices << ")!" + << " Raised by Kokkos::Cuda::Cuda().\n"; + Kokkos::abort(ss.str().c_str()); + } m_space_instance->initialize(device_id, stream, /*manage_stream*/ false); } From 035d2848709cde0f3d53774e023a679fcd4853ae Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 8 Sep 2023 10:05:27 -0400 Subject: [PATCH 009/432] Address reviewer's comments --- core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 1 + core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp | 2 -- core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 4 ++-- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 20 +++++++++++++------- core/src/Kokkos_Graph.hpp | 1 + 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index e83c041f764..576c53426bc 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -27,6 +27,7 @@ #include #include +#include namespace Kokkos { namespace Impl { diff --git 
a/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp b/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp index 4f972b02be3..819ee12f396 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp @@ -51,6 +51,4 @@ struct GraphNodeBackendDetailsBeforeTypeErasure - #endif diff --git a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index 34b69d061a9..3bde15444c7 100644 --- a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -22,9 +22,9 @@ #include #include - #include -#include + +#include namespace Kokkos { namespace Impl { diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 4b237625fbd..5aa34feea22 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -26,6 +26,10 @@ #include #if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#define KOKKOS_IMPL_HIP_GRAPH_ENABLED +#endif + +#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED #include #include #endif @@ -380,13 +384,13 @@ struct HIPParallelLaunchKernelInvoker create_graph(Closure&& arg_closure) { #include #include #if defined(KOKKOS_ENABLE_HIP) +// The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. 
#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) #include #endif From 41253bd55d7fad8c72f5f3756792705b66bdc954 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 11 Sep 2023 20:51:50 +0000 Subject: [PATCH 010/432] Set the device id in cuda_kernel_arch --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index e0fe4b109a1..2ed3608fda2 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -97,10 +97,11 @@ __global__ void query_cuda_kernel_arch(int *d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch() { +int cuda_kernel_arch(int cuda_device) { int arch = 0; int *d_arch = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); KOKKOS_IMPL_CUDA_SAFE_CALL( @@ -692,14 +693,12 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { const int cuda_device_id = Impl::get_gpu(settings); const auto &dev_info = Impl::CudaInternalDevices::singleton(); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; Impl::CudaInternal::m_deviceProp = cudaProp; // Query what compute capability architecture a kernel executes: - Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); + Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { std::stringstream ss; From 89a42341c9d0366e50152c888a0c8e933b9472f9 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 11 Sep 2023 14:07:27 -0700 Subject: [PATCH 011/432] [SYCL][Reduction] Group counter should use at least memory_order::acq_rel From https://en.cppreference.com/w/cpp/atomic/memory_order: > Atomic operations tagged 
memory_order_relaxed are not synchronization > operations; they do not impose an order among concurrent memory > accesses. They only guarantee atomicity and modification order > consistency. Yet we want to use that counter exactly for the synchronization purposes - to decide which WG finished last and should perform the final step. The same issue had been fixed in DPC++ at https://github.com/intel/llvm/pull/8058. --- core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp | 4 ++-- core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 4 ++-- core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 8a4949c5c4e..6964c2dbcf0 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -215,7 +215,7 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); @@ -260,7 +260,7 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index d343b126c6d..8c900cfa428 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -160,7 +160,7 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); @@ -202,7 +202,7 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 9beb62a38b2..07145b0fb93 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -208,7 +208,7 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); @@ -260,7 +260,7 @@ class Kokkos::Impl::ParallelReduce 
scratch_flags_ref(*scratch_flags); From fea838822b075c2b4996c1f5e40e3795222e3956 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 11 Sep 2023 15:26:56 -0700 Subject: [PATCH 012/432] Same for scan --- core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 3a176154181..04425723e19 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -179,7 +179,7 @@ class ParallelScanSYCLBase { group_results[item.get_group_linear_id()] = local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref scratch_flags_ref(*scratch_flags); From b85563160c354e4c6f1a0e145beab6cd8ed6dd7c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 15 Sep 2023 13:52:40 -0400 Subject: [PATCH 013/432] SIMD: Math functions should be in namespace Kokkos --- simd/src/Kokkos_SIMD_AVX2.hpp | 209 ++++++++++------ simd/src/Kokkos_SIMD_AVX512.hpp | 258 +++++++++++++------- simd/src/Kokkos_SIMD_Common.hpp | 150 +++++++++--- simd/src/Kokkos_SIMD_NEON.hpp | 68 ++++-- simd/src/Kokkos_SIMD_Scalar.hpp | 43 ++-- simd/unit_tests/include/SIMDTesting_Ops.hpp | 18 +- 6 files changed, 505 insertions(+), 241 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index ba3839c38db..e92dd83f1e1 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -629,82 +629,100 @@ class simd> { } }; +} // namespace Experimental + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> copysign( - simd> const& a, - simd> const& b) { +Experimental::simd> copysign( + Experimental::simd> const& a, + Experimental::simd> const& b) { __m256d const sign_mask = _mm256_set1_pd(-0.0); - return simd>( + return Experimental::simd>( _mm256_xor_pd(_mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)), _mm256_and_pd(sign_mask, static_cast<__m256d>(b)))); 
} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { +Experimental::simd> abs( + Experimental::simd> const& a) { __m256d const sign_mask = _mm256_set1_pd(-0.0); - return simd>( + return Experimental::simd>( _mm256_andnot_pd(sign_mask, static_cast<__m256d>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> sqrt( - simd> const& a) { - return simd>( +Experimental::simd> sqrt( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_sqrt_pd(static_cast<__m256d>(a))); } #ifdef __INTEL_COMPILER KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> cbrt( - simd> const& a) { - return simd>( +Experimental::simd> cbrt( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_cbrt_pd(static_cast<__m256d>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> exp( - simd> const& a) { - return simd>( +Experimental::simd> exp( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_exp_pd(static_cast<__m256d>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> log( - simd> const& a) { - return simd>( +Experimental::simd> log( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_log_pd(static_cast<__m256d>(a))); } #endif KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> fma( - simd> const& a, - simd> const& b, - simd> const& c) { - return simd>( +Experimental::simd> fma( + Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd> const& c) { + return Experimental::simd>( _mm256_fmadd_pd(static_cast<__m256d>(a), static_cast<__m256d>(b), static_cast<__m256d>(c))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> max( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> max( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( _mm256_max_pd(static_cast<__m256d>(a), static_cast<__m256d>(b))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> min( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> min( + 
Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( _mm256_min_pd(static_cast<__m256d>(a), static_cast<__m256d>(b))); } +namespace Experimental { + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition( simd_mask> const& a, @@ -814,81 +832,100 @@ class simd> { } }; +} // namespace Experimental + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> copysign( - simd> const& a, - simd> const& b) { +Experimental::simd> copysign( + Experimental::simd> const& + a, + Experimental::simd> const& + b) { __m128 const sign_mask = _mm_set1_ps(-0.0); - return simd>( + return Experimental::simd>( _mm_xor_ps(_mm_andnot_ps(sign_mask, static_cast<__m128>(a)), _mm_and_ps(sign_mask, static_cast<__m128>(b)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { +Experimental::simd> abs( + Experimental::simd> const& + a) { __m128 const sign_mask = _mm_set1_ps(-0.0); - return simd>( + return Experimental::simd>( _mm_andnot_ps(sign_mask, static_cast<__m128>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> sqrt( - simd> const& a) { - return simd>( +Experimental::simd> sqrt( + Experimental::simd> const& + a) { + return Experimental::simd>( _mm_sqrt_ps(static_cast<__m128>(a))); } #ifdef __INTEL_COMPILER KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> cbrt( - simd> const& a) { - return simd>( +Experimental::simd> cbrt( + Experimental::simd> const& + a) { + return Experimental::simd>( _mm_cbrt_ps(static_cast<__m128>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> exp( - simd> const& a) { - return simd>( +Experimental::simd> exp( + Experimental::simd> const& + a) { + return Experimental::simd>( _mm_exp_ps(static_cast<__m128>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> log( - simd> const& a) { - return simd>( +Experimental::simd> log( + Experimental::simd> const& + a) { + return Experimental::simd>( _mm_log_ps(static_cast<__m128>(a))); } #endif KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> fma( - simd> const& a, - simd> const& 
b, - simd> const& c) { - return simd>(_mm_fmadd_ps( - static_cast<__m128>(a), static_cast<__m128>(b), static_cast<__m128>(c))); +Experimental::simd> fma( + Experimental::simd> const& + a, + Experimental::simd> const& + b, + Experimental::simd> const& + c) { + return Experimental::simd>( + _mm_fmadd_ps(static_cast<__m128>(a), static_cast<__m128>(b), + static_cast<__m128>(c))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> max( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> max( + Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( _mm_max_ps(static_cast<__m128>(a), static_cast<__m128>(b))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> min( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> min( + Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( _mm_min_ps(static_cast<__m128>(a), static_cast<__m128>(b))); } +namespace Experimental { + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition( simd_mask> const& a, @@ -1021,13 +1058,20 @@ class simd> { } }; +} // namespace Experimental + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { +Experimental::simd> +abs(Experimental::simd> const& a) { __m128i const rhs = static_cast<__m128i>(a); - return simd>(_mm_abs_epi32(rhs)); + return Experimental::simd>( + _mm_abs_epi32(rhs)); } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1177,15 +1221,21 @@ class simd> { } }; +} // namespace Experimental + // Manually computing absolute values, because _mm256_abs_epi64 // is not in AVX2; it's available in AVX512. [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - abs(simd> const& a) { - return simd>( + Experimental::simd> + abs(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( [&](std::size_t i) { return (a[i] < 0) ? 
-a[i] : a[i]; }); } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1313,12 +1363,6 @@ simd>::simd( simd> const& other) : m_value(static_cast<__m256i>(other)) {} -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { - return a; -} - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1338,6 +1382,17 @@ simd>::simd( } } +} // namespace Experimental + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> +abs(Experimental::simd> const& a) { + return a; +} + +namespace Experimental { + template <> class const_where_expression>, simd>> { diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 575e845c515..75b62b94d63 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -272,14 +272,20 @@ class simd> { } }; -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - abs(simd> const& a) { +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> +abs(Experimental::simd> const& a) { __m256i const rhs = static_cast<__m256i>(a); - return simd>( + return Experimental::simd>( _mm256_abs_epi32(rhs)); } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -425,12 +431,17 @@ class simd> { } }; -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - abs(simd> const& a) { +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> +abs(Experimental::simd> const& a) { return a; } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -580,14 +591,20 @@ class simd> { } }; -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - 
abs(simd> const& a) { +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> +abs(Experimental::simd> const& a) { __m512i const rhs = static_cast<__m512i>(a); - return simd>( + return Experimental::simd>( _mm512_abs_epi64(rhs)); } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -742,12 +759,17 @@ class simd> { } }; -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - abs(simd> const& a) { +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> +abs(Experimental::simd> const& a) { return a; } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -886,13 +908,21 @@ class simd> { } }; -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - copysign(simd> const& a, - simd> const& b) { - static const __m512i sign_mask = reinterpret_cast<__m512i>( - static_cast<__m512d>(simd>(-0.0))); - return simd>( +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +copysign( + Experimental::simd> const& a, + Experimental::simd> const& b) { + static const __m512i sign_mask = + reinterpret_cast<__m512i>(static_cast<__m512d>( + Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>>(-0.0))); + return Experimental::simd>( reinterpret_cast<__m512d>(_mm512_xor_epi64( _mm512_andnot_epi64( sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(a))), @@ -901,75 +931,100 @@ class simd> { } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - abs(simd> const& a) { + Experimental::simd> + abs(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { __m512d 
const rhs = static_cast<__m512d>(a); #if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 830) - return simd>((__m512d)_mm512_and_epi64( - (__m512i)rhs, _mm512_set1_epi64(0x7fffffffffffffffLL))); + return Experimental::simd>( + (__m512d)_mm512_and_epi64((__m512i)rhs, + _mm512_set1_epi64(0x7fffffffffffffffLL))); #else - return simd>(_mm512_abs_pd(rhs)); + return Experimental::simd>( + _mm512_abs_pd(rhs)); #endif } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - sqrt(simd> const& a) { - return simd>( + Experimental::simd> + sqrt(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( _mm512_sqrt_pd(static_cast<__m512d>(a))); } #ifdef __INTEL_COMPILER [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - cbrt(simd> const& a) { - return simd>( + Experimental::simd> + cbrt(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( _mm512_cbrt_pd(static_cast<__m512d>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - exp(simd> const& a) { - return simd>( + Experimental::simd> + exp(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( _mm512_exp_pd(static_cast<__m512d>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - log(simd> const& a) { - return simd>( + Experimental::simd> + log(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( _mm512_log_pd(static_cast<__m512d>(a))); } #endif -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - fma(simd> const& a, - simd> const& b, - simd> const& c) { - return simd>( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +fma(Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd> const& c) { + return 
Experimental::simd>( _mm512_fmadd_pd(static_cast<__m512d>(a), static_cast<__m512d>(b), static_cast<__m512d>(c))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - max(simd> const& a, - simd> const& b) { - return simd>( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +max(Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( _mm512_max_pd(static_cast<__m512d>(a), static_cast<__m512d>(b))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - min(simd> const& a, - simd> const& b) { - return simd>( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +min(Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( _mm512_min_pd(static_cast<__m512d>(a), static_cast<__m512d>(b))); } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1085,81 +1140,110 @@ class simd> { } }; +} // namespace Experimental + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> copysign( - simd> const& a, - simd> const& b) { +Experimental::simd> +copysign( + Experimental::simd> const& a, + Experimental::simd> const& b) { __m256 const sign_mask = _mm256_set1_ps(-0.0); - return simd>( + return Experimental::simd>( _mm256_xor_ps(_mm256_andnot_ps(sign_mask, static_cast<__m256>(a)), _mm256_and_ps(sign_mask, static_cast<__m256>(b)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { +Experimental::simd> abs( + Experimental::simd> const& a) { __m256 const sign_mask = _mm256_set1_ps(-0.0); - return simd>( + return Experimental::simd>( _mm256_andnot_ps(sign_mask, static_cast<__m256>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> sqrt( - simd> const& a) { - return simd>( +Experimental::simd> sqrt( + Experimental::simd> const& a) { + return Experimental::simd>( 
_mm256_sqrt_ps(static_cast<__m256>(a))); } #ifdef __INTEL_COMPILER KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> cbrt( - simd> const& a) { - return simd>( +Experimental::simd> cbrt( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_cbrt_ps(static_cast<__m256>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> exp( - simd> const& a) { - return simd>( +Experimental::simd> exp( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_exp_ps(static_cast<__m256>(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> log( - simd> const& a) { - return simd>( +Experimental::simd> log( + Experimental::simd> const& a) { + return Experimental::simd>( _mm256_log_ps(static_cast<__m256>(a))); } #endif KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> fma( - simd> const& a, - simd> const& b, - simd> const& c) { - return simd>(_mm256_fmadd_ps( - static_cast<__m256>(a), static_cast<__m256>(b), static_cast<__m256>(c))); +Experimental::simd> fma( + Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd> const& c) { + return Experimental::simd>( + _mm256_fmadd_ps(static_cast<__m256>(a), static_cast<__m256>(b), + static_cast<__m256>(c))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> max( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> max( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( _mm256_max_ps(static_cast<__m256>(a), static_cast<__m256>(b))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> min( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> min( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( _mm256_min_ps(static_cast<__m256>(a), static_cast<__m256>(b))); } +namespace Experimental { + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition( simd_mask> const& a, diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index b0559696a8b..c6fd16ca686 100644 
--- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -378,6 +378,18 @@ template return result; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Experimental { +template +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION + Experimental::simd + min(Experimental::simd const& a, + Experimental::simd const& b) { + return Kokkos::min(a, b); +} +} // namespace Experimental +#endif + template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd max( Experimental::simd const& a, Experimental::simd const& b) { @@ -388,23 +400,54 @@ template return result; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Experimental { +template +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION + Experimental::simd + max(Experimental::simd const& a, + Experimental::simd const& b) { + return Kokkos::max(a, b); +} +} // namespace Experimental +#endif + // fallback implementations of functions. // individual Abi types may provide overloads with more efficient // implementations. 
// These are not in the Experimental namespace because their double // overloads are not either -#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd \ - FUNC(Experimental::simd const& a) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); \ - ++i) { \ - result[i] = Kokkos::FUNC(a[i]); \ - } \ - return result; \ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION simd \ + FUNC(simd const& a) { \ + return Kokkos::FUNC(a); \ + } \ } +#else +#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i]); \ + } \ + return result; \ + } +#endif KOKKOS_IMPL_SIMD_UNARY_FUNCTION(abs) KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp) @@ -431,37 +474,78 @@ KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erfc) KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tgamma) KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma) -#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd \ - FUNC(Experimental::simd const& a, \ - Experimental::simd const& b) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); \ - ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i]); \ - } \ - return result; \ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ + 
template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION simd \ + FUNC(simd const& a, simd const& b) { \ + return Kokkos::FUNC(a, b); \ + } \ } +#else +#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i]); \ + } \ + return result; \ + } +#endif KOKKOS_IMPL_SIMD_BINARY_FUNCTION(pow) KOKKOS_IMPL_SIMD_BINARY_FUNCTION(hypot) KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2) KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign) -#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd \ - FUNC(Experimental::simd const& a, \ - Experimental::simd const& b, \ - Experimental::simd const& c) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); \ - ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ - } \ - return result; \ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b, \ + Experimental::simd const& c) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED
KOKKOS_FORCEINLINE_FUNCTION simd \ + FUNC(simd const& a, simd const& b, simd const& c) { \ + return Kokkos::FUNC(a, b, c); \ + } \ } +#else +#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b, \ + Experimental::simd const& c) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ + } \ + return result; \ + } +#endif KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(fma) KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(hypot) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 6d8145aa6c0..a872fa71c9c 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -424,57 +424,73 @@ class simd> { } }; +} // namespace Experimental + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { - return simd>( +Experimental::simd> abs( + Experimental::simd> const& a) { + return Experimental::simd>( vabsq_f64(static_cast(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> copysign( - simd> const& a, - simd> const& b) { +Experimental::simd> copysign( + Experimental::simd> const& a, + Experimental::simd> const& b) { uint64x2_t const sign_mask = vreinterpretq_u64_f64(vmovq_n_f64(-0.0)); - return simd>(vreinterpretq_f64_u64( - vorrq_u64(vreinterpretq_u64_f64(static_cast(abs(a))), - vandq_u64(sign_mask, vreinterpretq_u64_f64( - static_cast(b)))))); + return Experimental::simd>( + vreinterpretq_f64_u64(vorrq_u64( + vreinterpretq_u64_f64(static_cast(abs(a))), + vandq_u64(sign_mask, + vreinterpretq_u64_f64(static_cast(b)))))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> sqrt( - simd> const& a) { - return simd>( +Experimental::simd> sqrt( + Experimental::simd> const& a) { + return Experimental::simd>( vsqrtq_f64(static_cast(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> fma( - simd> const& a, - simd> const& 
b, - simd> const& c) { - return simd>( +Experimental::simd> fma( + Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd> const& c) { + return Experimental::simd>( vfmaq_f64(static_cast(c), static_cast(b), static_cast(a))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> max( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> max( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( vmaxq_f64(static_cast(a), static_cast(b))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> min( - simd> const& a, - simd> const& b) { - return simd>( +Experimental::simd> min( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( vminq_f64(static_cast(a), static_cast(b))); } +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, diff --git a/simd/src/Kokkos_SIMD_Scalar.hpp b/simd/src/Kokkos_SIMD_Scalar.hpp index a6031185253..bf6aeb9dd39 100644 --- a/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/simd/src/Kokkos_SIMD_Scalar.hpp @@ -201,9 +201,12 @@ class simd { } }; +} // namespace Experimental + template -[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd abs( - simd const& a) { +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION + Experimental::simd + abs(Experimental::simd const& a) { if constexpr (std::is_signed_v) { return (a < 0 ? 
-a : a); } @@ -211,19 +214,33 @@ template } template -[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd sqrt( - simd const& a) { - return simd(std::sqrt(static_cast(a))); +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION + Experimental::simd + sqrt(Experimental::simd const& a) { + return Experimental::simd( + std::sqrt(static_cast(a))); +} + +template +KOKKOS_FORCEINLINE_FUNCTION + Experimental::simd + fma(Experimental::simd const& x, + Experimental::simd const& y, + Experimental::simd const& z) { + return Experimental::simd( + (static_cast(x) * static_cast(y)) + static_cast(z)); } template -KOKKOS_FORCEINLINE_FUNCTION simd fma( - simd const& x, simd const& y, - simd const& z) { - return simd((static_cast(x) * static_cast(y)) + - static_cast(z)); +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION + Experimental::simd + copysign(Experimental::simd const& a, + Experimental::simd const& b) { + return std::copysign(static_cast(a), static_cast(b)); } +namespace Experimental { + template KOKKOS_FORCEINLINE_FUNCTION simd condition( desul::Impl::dont_deduce_this_parameter_t< @@ -233,12 +250,6 @@ KOKKOS_FORCEINLINE_FUNCTION simd condition( : static_cast(c)); } -template -[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd copysign( - simd const& a, simd const& b) { - return std::copysign(static_cast(a), static_cast(b)); -} - template class const_where_expression, simd> { diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 9d1c0ec4e6d..15813a963c4 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -79,7 +79,14 @@ class absolutes { public: template auto on_host(T const& a) const { - return Kokkos::Experimental::abs(a); + if constexpr (std::is_signed_v) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + return Kokkos::Experimental::abs(a); +#else + return Kokkos::abs(a); +#endif + } + return a; } template auto on_host_serial(T const& a) const { @@ -87,7 +94,14 @@ class absolutes { } template 
KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { - return Kokkos::Experimental::abs(a); + if constexpr (std::is_signed_v) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + return Kokkos::Experimental::abs(a); +#else + return Kokkos::abs(a); +#endif + } + return a; } template KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { From f6977cf431cf245341e6dd2f744c79c5a62a315e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 25 Sep 2023 13:40:28 -0400 Subject: [PATCH 014/432] Check for default device --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 2ed3608fda2..15632ab5154 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -867,6 +867,11 @@ Cuda::Cuda(int device_id, cudaStream_t stream) << " Raised by Kokkos::Cuda::Cuda().\n"; Kokkos::abort(ss.str().c_str()); } + // FIXME_CUDA + if (device_id != Cuda().cuda_device()) + Kokkos::abort( + "Currently, the device id must match the device id used when Kokkos " + "was initialized!"); m_space_instance->initialize(device_id, stream, /*manage_stream*/ false); } From 6c6a26ab1f00102b863d544a17ff562878028e80 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 27 Jun 2023 13:19:45 +0200 Subject: [PATCH 015/432] Add parallel_scan overloads with value for HIP backend --- core/src/HIP/Kokkos_HIP_Team.hpp | 30 +++++++++++++++++++++++++++++- core/unit_test/TestTeamVector.hpp | 14 +++++++------- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Team.hpp b/core/src/HIP/Kokkos_HIP_Team.hpp index 14214f82eb0..e32d2f42553 100644 --- a/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Team.hpp @@ -797,7 +797,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< // exclusive scan -- the final accumulation // of i's val will be included in the second // closure call 
later. - if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); + if (i - 1 < loop_boundaries.end && threadIdx.x > 0) + closure(i - 1, val, false); // Bottom up exclusive scan in triangular pattern // where each HIP thread is the root of a reduction tree @@ -826,6 +827,7 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< if (i < loop_boundaries.end) closure(i, val, true); Impl::in_place_shfl(accum, val, blockDim.x - 1, blockDim.x); } + reducer.reference() = accum; #else (void)loop_boundaries; (void)closure; @@ -855,6 +857,32 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); } +/** \brief Intra-thread vector parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure, ValueType& return_val) { + // Extract ValueType from the Closure + using closure_value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + ValueType>::value_type; + static_assert(std::is_same::value, + "Non-matching value types of closure and return type"); + + ValueType accum; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(accum)); + + return_val = accum; +} + } // namespace Kokkos namespace Kokkos { diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 3825ab28559..1a0a1798a56 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -607,9 +607,9 @@ struct functor_vec_scan { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. 
-#if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENACC) && \ - !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_THREADS) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ + !defined(KOKKOS_ENABLE_HPX) template struct functor_vec_scan_ret_val { using policy_type = Kokkos::TeamPolicy; @@ -736,9 +736,9 @@ bool test_scalar(int nteams, int team_size, int test) { } else if (test == 12) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENACC) && \ - !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_THREADS) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ + !defined(KOKKOS_ENABLE_HPX) Kokkos::parallel_for( Kokkos::TeamPolicy(nteams, team_size, 8), functor_vec_scan_ret_val(d_flag, team_size)); @@ -1016,7 +1016,7 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !(defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ENABLE_HIP)) +#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(0))); ASSERT_TRUE((TestTeamVector::Test(1))); From df1901b1c004402cbaaf388b38d01b259569174f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 27 Sep 2023 12:03:53 +0200 Subject: [PATCH 016/432] Use std::is_same_v --- core/src/HIP/Kokkos_HIP_Team.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_Team.hpp b/core/src/HIP/Kokkos_HIP_Team.hpp index e32d2f42553..fb466d8a721 100644 --- a/core/src/HIP/Kokkos_HIP_Team.hpp +++ 
b/core/src/HIP/Kokkos_HIP_Team.hpp @@ -874,7 +874,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( using closure_value_type = typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, ValueType>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); ValueType accum; From e4eb204ee63691254ba0cd6f651345e456709092 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Fri, 14 Jul 2023 17:12:29 +0200 Subject: [PATCH 017/432] #5635: Move some tests for parallel_scan to TestTeamScan --- core/unit_test/TestTeamScan.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestTeamScan.hpp b/core/unit_test/TestTeamScan.hpp index d084e8b6571..af3b71f5319 100644 --- a/core/unit_test/TestTeamScan.hpp +++ b/core/unit_test/TestTeamScan.hpp @@ -132,8 +132,8 @@ TEST(TEST_CATEGORY, team_scan) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. 
-#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ + !defined(KOKKOS_ENABLE_HPX) template struct TestTeamScanRetVal { using execution_space = ExecutionSpace; From 7d817b88b36d8ff95dadc45e61c27017a61a670c Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 25 Jul 2023 15:03:01 +0200 Subject: [PATCH 018/432] #5635: SYCL: Add parallel_scan overload with return value --- core/src/SYCL/Kokkos_SYCL_Team.hpp | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 0a9f3b90503..10f72520f77 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -573,15 +573,17 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< * final == true. */ // This is the same code as in CUDA and largely the same as in OpenMPTarget -template +template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::TeamThreadRangeBoundariesStruct& loop_bounds, - const FunctorType& lambda) { - // Extract value_type from lambda - using value_type = typename Kokkos::Impl::FunctorAnalysis< + const FunctorType& lambda, ValueType& return_val) { + // Extract ValueType from the Closure + using closure_value_type = typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, void>::value_type; + static_assert(std::is_same::value, + "Non-matching value types of closure and return type"); const auto start = loop_bounds.start; const auto end = loop_bounds.end; @@ -589,12 +591,12 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const auto team_size = member.team_size(); const auto team_rank = member.team_rank(); const auto nchunk = (end - start + team_size - 1) / team_size; - value_type accum = 0; + ValueType accum = 0; // each team 
has to process one or more chunks of the prefix scan for (iType i = 0; i < nchunk; ++i) { auto ii = start + i * team_size + team_rank; // local accumulation for this chunk - value_type local_accum = 0; + ValueType local_accum = 0; // user updates value with prefix value if (ii < loop_bounds.end) lambda(ii, local_accum, false); // perform team scan @@ -608,6 +610,21 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( // broadcast last value to rest of the team member.team_broadcast(accum, team_size - 1); } + + return_val = accum; +} + +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct& + loop_bounds, + const FunctorType& lambda) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; + + value_type scan_val; + parallel_scan(loop_bounds, lambda, scan_val); } template From bbfe639818baf4d315b1df555d62654b0fca54dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 27 Sep 2023 12:17:41 +0200 Subject: [PATCH 019/432] Use std::is_same_v --- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 10f72520f77..a6be78d5e96 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -582,7 +582,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( using closure_value_type = typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, void>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); const auto start = loop_bounds.start; From b610a288bff1e5de9ef46f7c0ae901b38c9b2ae2 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 27 Sep 2023 10:08:02 -0400 Subject: [PATCH 020/432] OpenMP: Fix TeamThreadRange parallel_scan with return value for 
team_size > 1 --- core/src/impl/Kokkos_HostThreadTeam.hpp | 6 +++++- core/unit_test/TestTeamScan.hpp | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 70e5566d665..51f25a8b60f 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,14 +885,18 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } + auto team_member = loop_boundaries.thread; + // 'accum' output is the exclusive prefix sum - accum = loop_boundaries.thread.team_scan(accum); + accum = team_member.team_scan(accum); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { closure(i, accum, true); } + team_member.team_broadcast(accum, team_member.team_size() - 1); + return_val = accum; } diff --git a/core/unit_test/TestTeamScan.hpp b/core/unit_test/TestTeamScan.hpp index d084e8b6571..e20f9503e52 100644 --- a/core/unit_test/TestTeamScan.hpp +++ b/core/unit_test/TestTeamScan.hpp @@ -185,7 +185,14 @@ struct TestTeamScanRetVal { a_r = view_2d_type("a_r", M, N); a_s = view_1d_type("a_s", M); - Kokkos::parallel_for(policy_type(M, Kokkos::AUTO), *this); + // Set team size explicitly to check whether non-power-of-two team sizes can + // be used. 
+ if (ExecutionSpace().concurrency() > 10000) + Kokkos::parallel_for(policy_type(M, 127), *this); + else if (ExecutionSpace().concurrency() > 2) + Kokkos::parallel_for(policy_type(M, 3), *this); + else + Kokkos::parallel_for(policy_type(M, 1), *this); Kokkos::fence(); auto a_i = Kokkos::create_mirror_view(a_d); From 82044c6963c2fdb34b3a27d5635fd21f268a3ee3 Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Sun, 1 Oct 2023 19:15:39 +0200 Subject: [PATCH 021/432] Add compatible copy assignment operator to DualView --- containers/src/Kokkos_DualView.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index e821570a8d5..84bced2cc44 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -292,6 +292,15 @@ class DualView : public ViewTraits { d_view(src.d_view), h_view(src.h_view) {} + //! Copy assignment operator (shallow copy assignment) + template + DualView& operator=(const DualView& src) { + modified_flags = src.modified_flags; + d_view = src.d_view; + h_view = src.h_view; + return *this; + } + //! Subview constructor template DualView(const DualView& src, const Arg0& arg0, Args... 
args) From e1f2cf545be8f7c91de73d48267d0344f7bd4696 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 2 Oct 2023 19:35:26 +0000 Subject: [PATCH 022/432] Fix minimum version for Google benchmark --- core/perf_test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 0ed0e62ba98..7f3916da312 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -38,7 +38,7 @@ IF (KOKKOS_HAS_TRILINOS) ENDIF() # Find or download google/benchmark library -find_package(benchmark QUIET) +find_package(benchmark QUIET 1.5.6) IF(benchmark_FOUND) MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") ELSE() From 39316fa8c80f238a9eb0fb67e4d904f42fe93fa4 Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Mon, 2 Oct 2023 20:18:13 +0200 Subject: [PATCH 023/432] Add test of copy constructor/assignment operator for DualView. --- containers/unit_tests/TestDualView.hpp | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index 6e2e56c5cf7..54086914bd0 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -60,6 +60,48 @@ struct test_dualview_alloc { } }; +template +struct test_dualview_copy_construction_and_assignment { + using scalar_type = Scalar; + using execution_space = Device; + + template + void run_me() { + const unsigned int n = 10; + const unsigned int m = 5; + + ViewType a("A", n, m); + + // Copy construction + ViewType b(a); + + // Copy assignment + ViewType c = a; + + // Check equality (shallow) of the host and device views + ASSERT_EQ(a.view_host(), b.view_host()); + ASSERT_EQ(a.view_device(), b.view_device()); + + ASSERT_EQ(a.view_host(), c.view_host()); + ASSERT_EQ(a.view_device(), c.view_device()); + + // We can't test shallow equality of modified_flags because it's protected. 
+ // So we test it indirectly through sync state behavior. + if (!std::decay_t::impl_dualview_is_single_device::value) { + a.clear_sync_state(); + a.modify_host(); + ASSERT_TRUE(a.need_sync_device()); + ASSERT_TRUE(b.need_sync_device()); + ASSERT_TRUE(c.need_sync_device()); + a.clear_sync_state(); + } + } + + test_dualview_copy_construction_and_assignment() { + run_me >(); + } +}; + template struct test_dualview_combinations { using self_type = test_dualview_combinations; @@ -379,6 +421,11 @@ void test_dualview_alloc(unsigned int size) { ASSERT_TRUE(test.result); } +template +void test_dualview_copy_construction_and_assignment() { + Impl::test_dualview_copy_construction_and_assignment(); +} + template void test_dualview_deep_copy() { Impl::test_dual_view_deep_copy(); @@ -404,6 +451,10 @@ TEST(TEST_CATEGORY, dualview_alloc) { test_dualview_alloc(10); } +TEST(TEST_CATEGORY, test_dualview_copy_construction_and_assignment) { + test_dualview_copy_construction_and_assignment(); +} + TEST(TEST_CATEGORY, dualview_combinations_without_init) { test_dualview_combinations(10, false); } From 7447118649e2accfa8c7102fcb75225439e5e7a3 Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Tue, 3 Oct 2023 09:33:02 +0200 Subject: [PATCH 024/432] Compute concurrency on HIP using Kokkos hardcoded m_maxWavesPerCU --- core/src/HIP/Kokkos_HIP_Instance.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 0740866439d..7f04eb721cb 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -76,8 +76,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_deviceProp.maxThreadsPerMultiProcessor * - m_deviceProp.multiProcessorCount; + static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + return 
concurrency; } From 94c5d9ab61f89c58de6119618279fa140e78b37e Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Tue, 3 Oct 2023 15:40:58 +0200 Subject: [PATCH 025/432] Modify test so that source and destination view are of different type --- containers/unit_tests/TestDualView.hpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index 54086914bd0..b4bfca5ca82 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -65,18 +65,21 @@ struct test_dualview_copy_construction_and_assignment { using scalar_type = Scalar; using execution_space = Device; - template void run_me() { - const unsigned int n = 10; - const unsigned int m = 5; + constexpr unsigned int n = 10; + constexpr unsigned int m = 5; - ViewType a("A", n, m); + using SrcViewType = Kokkos::DualView; + using DstViewType = + Kokkos::DualView; + + SrcViewType a("A", n, m); // Copy construction - ViewType b(a); + DstViewType b(a); // Copy assignment - ViewType c = a; + DstViewType c = a; // Check equality (shallow) of the host and device views ASSERT_EQ(a.view_host(), b.view_host()); @@ -87,7 +90,7 @@ struct test_dualview_copy_construction_and_assignment { // We can't test shallow equality of modified_flags because it's protected. // So we test it indirectly through sync state behavior. 
- if (!std::decay_t::impl_dualview_is_single_device::value) { + if (!std::decay_t::impl_dualview_is_single_device::value) { a.clear_sync_state(); a.modify_host(); ASSERT_TRUE(a.need_sync_device()); @@ -97,9 +100,7 @@ struct test_dualview_copy_construction_and_assignment { } } - test_dualview_copy_construction_and_assignment() { - run_me >(); - } + test_dualview_copy_construction_and_assignment() { run_me(); } }; template From d8f8142ab77fd354651cf208d49f3e7c50d35070 Mon Sep 17 00:00:00 2001 From: Maarten Arnst Date: Wed, 4 Oct 2023 16:45:53 +0200 Subject: [PATCH 026/432] Use call operator instead of run_me function --- containers/unit_tests/TestDualView.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index b4bfca5ca82..a15e5fa2997 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -65,14 +65,14 @@ struct test_dualview_copy_construction_and_assignment { using scalar_type = Scalar; using execution_space = Device; - void run_me() { + void operator()() { constexpr unsigned int n = 10; constexpr unsigned int m = 5; using SrcViewType = Kokkos::DualView; using DstViewType = Kokkos::DualView; - + SrcViewType a("A", n, m); // Copy construction @@ -99,8 +99,6 @@ struct test_dualview_copy_construction_and_assignment { a.clear_sync_state(); } } - - test_dualview_copy_construction_and_assignment() { run_me(); } }; template @@ -424,7 +422,7 @@ void test_dualview_alloc(unsigned int size) { template void test_dualview_copy_construction_and_assignment() { - Impl::test_dualview_copy_construction_and_assignment(); + Impl::test_dualview_copy_construction_and_assignment()(); } template From 60e4d1359f1418e0778c9b68c405990798d25c04 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Wed, 4 Oct 2023 19:33:40 +0200 Subject: [PATCH 027/432] team-level std algos: part 12 (#6350) Add team-level exclusive scan and transform exclusive 
scan algorithms --- .../std_algorithms/Kokkos_ExclusiveScan.hpp | 243 ++++++++++++----- .../Kokkos_TransformExclusiveScan.hpp | 130 ++++++--- .../impl/Kokkos_ExclusiveScan.hpp | 238 ++++++++-------- .../impl/Kokkos_FunctorsForExclusiveScan.hpp | 220 +++++++++++++++ .../impl/Kokkos_TransformExclusiveScan.hpp | 122 ++++----- algorithms/unit_tests/CMakeLists.txt | 14 +- .../TestStdAlgorithmsExclusiveScan.cpp | 5 +- .../TestStdAlgorithmsTeamExclusiveScan.cpp | 253 ++++++++++++++++++ ...tdAlgorithmsTeamTransformExclusiveScan.cpp | 228 ++++++++++++++++ ...estStdAlgorithmsTransformExclusiveScan.cpp | 2 +- 10 files changed, 1158 insertions(+), 297 deletions(-) create mode 100644 algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp diff --git a/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp index 4e05676c2c1..ee3a1051264 100644 --- a/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp @@ -23,105 +23,130 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - ValueType init_value) { - static_assert(std::is_move_constructible::value, +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value) { + static_assert(std::is_move_constructible_v, "ValueType must be move 
constructible."); - return Impl::exclusive_scan_default_op_impl( + return Impl::exclusive_scan_default_op_exespace_impl( "Kokkos::exclusive_scan_default_functors_iterator_api", ex, first, last, - first_dest, init_value); + first_dest, std::move(init_value)); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, ValueType init_value) { - static_assert(std::is_move_constructible::value, +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value) { + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_default_op_impl(label, ex, first, last, - first_dest, init_value); + return Impl::exclusive_scan_default_op_exespace_impl( + label, ex, first, last, first_dest, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_default_op_impl( + return Impl::exclusive_scan_default_op_exespace_impl( "Kokkos::exclusive_scan_default_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), - KE::begin(view_dest), init_value); + return Impl::exclusive_scan_default_op_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value)); } // overload set 2 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> 
-exclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - ValueType init_value, BinaryOpType bop) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_custom_op_impl( + return Impl::exclusive_scan_custom_op_exespace_impl( "Kokkos::exclusive_scan_custom_functors_iterator_api", ex, first, last, - first_dest, init_value, bop); + first_dest, std::move(init_value), bop); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, ValueType init_value, - BinaryOpType bop) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_custom_op_impl(label, ex, first, last, first_dest, - init_value, bop); + return Impl::exclusive_scan_custom_op_exespace_impl( + label, ex, first, last, first_dest, std::move(init_value), bop); } -template +template < + typename ExecutionSpace, typename DataType1, typename... 
Properties1, + typename DataType2, typename... Properties2, typename ValueType, + typename BinaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -129,18 +154,20 @@ auto exclusive_scan(const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_custom_op_impl( + return Impl::exclusive_scan_custom_op_exespace_impl( "Kokkos::exclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value, bop); + std::move(init_value), bop); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename ValueType, + typename BinaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -148,12 +175,92 @@ auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_custom_op_impl( + return Impl::exclusive_scan_custom_op_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), init_value, bop); + KE::begin(view_dest), std::move(init_value), bop); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. 
+// + +// overload set 1 +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::exclusive_scan_default_op_team_impl( + teamHandle, first, last, first_dest, std::move(init_value)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto exclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + ValueType init_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; + return Impl::exclusive_scan_default_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value)); +} + +// overload set 2 +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { + Impl::static_assert_is_not_openmptarget(teamHandle); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::exclusive_scan_custom_op_team_impl( + teamHandle, first, last, first_dest, std::move(init_value), bop); +} + +template , int> = 0> +KOKKOS_FUNCTION auto exclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + ValueType init_value, BinaryOpType bop) { + Impl::static_assert_is_not_openmptarget(teamHandle); + 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; + return Impl::exclusive_scan_custom_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value), bop); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp index 9d85aee06f8..37fc0f860ee 100644 --- a/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp @@ -23,44 +23,52 @@ namespace Kokkos { namespace Experimental { -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_exclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - ValueType init_value, BinaryOpType binary_op, - UnaryOpType unary_op) { +// +// overload set accepting execution space +// +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_exclusive_scan( + const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, + OutputIteratorType first_dest, ValueType init_value, BinaryOpType binary_op, + UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::transform_exclusive_scan_impl( + return Impl::transform_exclusive_scan_exespace_impl( "Kokkos::transform_exclusive_scan_custom_functors_iterator_api", ex, - first, last, first_dest, init_value, binary_op, unary_op); + first, last, first_dest, 
std::move(init_value), binary_op, unary_op); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_exclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, ValueType init_value, - BinaryOpType binary_op, UnaryOpType unary_op) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_exclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, + BinaryOpType binary_op, UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::transform_exclusive_scan_impl(label, ex, first, last, first_dest, - init_value, binary_op, unary_op); + return Impl::transform_exclusive_scan_exespace_impl( + label, ex, first, last, first_dest, std::move(init_value), binary_op, + unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename ValueType, + typename BinaryOpType, typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_exclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -69,18 +77,20 @@ auto transform_exclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::transform_exclusive_scan_impl( + return Impl::transform_exclusive_scan_exespace_impl( "Kokkos::transform_exclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value, binary_op, unary_op); + std::move(init_value), binary_op, unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename ValueType, + typename BinaryOpType, typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_exclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -89,12 +99,56 @@ auto transform_exclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::transform_exclusive_scan_impl( + return Impl::transform_exclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), init_value, binary_op, unary_op); + KE::begin(view_dest), std::move(init_value), binary_op, unary_op); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. 
+// +template && :: + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType transform_exclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, + BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::transform_exclusive_scan_team_impl( + teamHandle, first, last, first_dest, std::move(init_value), binary_op, + unary_op); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform_exclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; + return Impl::transform_exclusive_scan_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value), binary_op, unary_op); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp index 71f13e490af..6da992b4bbe 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp @@ -22,6 +22,7 @@ #include "Kokkos_HelperPredicates.hpp" #include "Kokkos_ValueWrapperForNoNeutralElement.hpp" #include "Kokkos_IdentityReferenceUnaryFunctor.hpp" +#include "Kokkos_FunctorsForExclusiveScan.hpp" #include #include #include @@ -30,127 +31,15 @@ namespace Kokkos { namespace Experimental { 
namespace Impl { -template -struct ExclusiveScanDefaultFunctorForKnownNeutralElement { - using execution_space = ExeSpace; - - ValueType m_init_value; - FirstFrom m_first_from; - FirstDest m_first_dest; - - KOKKOS_FUNCTION - ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init, - FirstFrom first_from, - FirstDest first_dest) - : m_init_value(std::move(init)), - m_first_from(std::move(first_from)), - m_first_dest(std::move(first_dest)) {} - - KOKKOS_FUNCTION - void operator()(const IndexType i, ValueType& update, - const bool final_pass) const { - if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; - } -}; - -template -struct ExclusiveScanDefaultFunctor { - using execution_space = ExeSpace; - using value_type = - ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; - - ValueType m_init_value; - FirstFrom m_first_from; - FirstDest m_first_dest; - - KOKKOS_FUNCTION - ExclusiveScanDefaultFunctor(ValueType init, FirstFrom first_from, - FirstDest first_dest) - : m_init_value(std::move(init)), - m_first_from(std::move(first_from)), - m_first_dest(std::move(first_dest)) {} - - KOKKOS_FUNCTION - void operator()(const IndexType i, value_type& update, - const bool final_pass) const { - if (final_pass) { - if (i == 0) { - m_first_dest[i] = m_init_value; - } else { - m_first_dest[i] = update.val + m_init_value; - } - } - - const auto tmp = value_type{m_first_from[i], false}; - this->join(update, tmp); - } - - KOKKOS_FUNCTION - void init(value_type& update) const { - update.val = {}; - update.is_initial = true; - } - - KOKKOS_FUNCTION - void join(value_type& update, const value_type& input) const { - if (input.is_initial) return; - - if (update.is_initial) { - update.val = input.val; - update.is_initial = false; - } else { - update.val = update.val + input.val; - } - } -}; - +// +// exespace impl +// template -OutputIteratorType exclusive_scan_custom_op_impl( + class OutputIteratorType, class ValueType> +OutputIteratorType 
exclusive_scan_default_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, - OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { - // checks - Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); - Impl::static_assert_iterators_have_matching_difference_type(first_from, - first_dest); - Impl::expect_valid_range(first_from, last_from); - - // aliases - using index_type = typename InputIteratorType::difference_type; - using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; - using func_type = - TransformExclusiveScanFunctor; - - // run - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); - ::Kokkos::parallel_scan( - label, RangePolicy(ex, 0, num_elements), - func_type(init_value, first_from, first_dest, bop, unary_op_type())); - ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation"); - - // return - return first_dest + num_elements; -} - -template -using ex_scan_has_reduction_identity_sum_t = - decltype(Kokkos::reduction_identity::sum()); - -template -OutputIteratorType exclusive_scan_default_op_impl(const std::string& label, - const ExecutionSpace& ex, - InputIteratorType first_from, - InputIteratorType last_from, - OutputIteratorType first_dest, - ValueType init_value) { + OutputIteratorType first_dest, ValueType init_value) { // checks Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); Impl::static_assert_iterators_have_matching_difference_type(first_from, @@ -184,17 +73,122 @@ OutputIteratorType exclusive_scan_default_op_impl(const std::string& label, ExclusiveScanDefaultFunctorForKnownNeutralElement< ExecutionSpace, index_type, ValueType, InputIteratorType, OutputIteratorType>, - ExclusiveScanDefaultFunctor>; + ExclusiveScanDefaultFunctorWithValueWrapper>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + 
::Kokkos::parallel_scan( + label, RangePolicy(ex, 0, num_elements), + func_type(std::move(init_value), first_from, first_dest)); + + ex.fence("Kokkos::exclusive_scan_default_op: fence after operation"); + + return first_dest + num_elements; +} + +template +OutputIteratorType exclusive_scan_custom_op_exespace_impl( + const std::string& label, const ExecutionSpace& ex, + InputIteratorType first_from, InputIteratorType last_from, + OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { + // checks + Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + // aliases + using index_type = typename InputIteratorType::difference_type; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using func_type = TransformExclusiveScanFunctorWithValueWrapper< + ExecutionSpace, index_type, ValueType, InputIteratorType, + OutputIteratorType, BinaryOpType, unary_op_type>; // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_scan(label, RangePolicy(ex, 0, num_elements), - func_type(init_value, first_from, first_dest)); + func_type(std::move(init_value), first_from, + first_dest, bop, unary_op_type())); + ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation"); - ex.fence("Kokkos::exclusive_scan_default_op: fence after operation"); + // return + return first_dest + num_elements; +} + +// +// team impl +// +template +KOKKOS_FUNCTION OutputIteratorType exclusive_scan_default_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + ValueType init_value) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + 
first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "The team-level impl of Kokkos::Experimental::exclusive_scan currently " + "does not support types without reduction identity"); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using index_type = typename InputIteratorType::difference_type; + using func_type = ExclusiveScanDefaultFunctorForKnownNeutralElement< + exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType>; + + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan( + TeamThreadRange(teamHandle, 0, num_elements), + func_type(std::move(init_value), first_from, first_dest)); + teamHandle.team_barrier(); + return first_dest + num_elements; +} + +template +KOKKOS_FUNCTION OutputIteratorType exclusive_scan_custom_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "The team-level impl of Kokkos::Experimental::exclusive_scan currently " + "does not support types without reduction identity"); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using index_type = typename InputIteratorType::difference_type; + using func_type = TransformExclusiveScanFunctorWithoutValueWrapper< + exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType, + BinaryOpType, unary_op_type>; + + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + 
::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(std::move(init_value), first_from, + first_dest, bop, unary_op_type())); + teamHandle.team_barrier(); return first_dest + num_elements; } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp new file mode 100644 index 00000000000..8151ee34955 --- /dev/null +++ b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -0,0 +1,220 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_STD_ALGORITHMS_FUNCTORS_FOR_EXCLUSIVE_SCAN_IMPL_HPP +#define KOKKOS_STD_ALGORITHMS_FUNCTORS_FOR_EXCLUSIVE_SCAN_IMPL_HPP + +#include +#include "Kokkos_ValueWrapperForNoNeutralElement.hpp" + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template +using ex_scan_has_reduction_identity_sum_t = + decltype(Kokkos::reduction_identity::sum()); + +template +struct ExclusiveScanDefaultFunctorForKnownNeutralElement { + using execution_space = ExeSpace; + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + + KOKKOS_FUNCTION + ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init, + FirstFrom first_from, + FirstDest first_dest) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, ValueType& update, + const bool final_pass) const { + if (final_pass) m_first_dest[i] = update + m_init_value; + update += m_first_from[i]; + } +}; + +template +struct ExclusiveScanDefaultFunctorWithValueWrapper { + using execution_space = ExeSpace; + using value_type = + ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + + KOKKOS_FUNCTION + ExclusiveScanDefaultFunctorWithValueWrapper(ValueType init, + FirstFrom first_from, + FirstDest first_dest) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, value_type& update, + const bool final_pass) const { + if (final_pass) { + if (i == 0) { + m_first_dest[i] = m_init_value; + } else { + m_first_dest[i] = update.val + m_init_value; + } + } + + const auto tmp = value_type{m_first_from[i], false}; + this->join(update, tmp); + } + + KOKKOS_FUNCTION + void init(value_type& update) const { + 
update.val = {}; + update.is_initial = true; + } + + KOKKOS_FUNCTION + void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + + if (update.is_initial) { + update.val = input.val; + update.is_initial = false; + } else { + update.val = update.val + input.val; + } + } +}; + +template +struct TransformExclusiveScanFunctorWithValueWrapper { + using execution_space = ExeSpace; + using value_type = + ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; + + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + BinaryOpType m_binary_op; + UnaryOpType m_unary_op; + + KOKKOS_FUNCTION + TransformExclusiveScanFunctorWithValueWrapper(ValueType init, + FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)), + m_binary_op(std::move(bop)), + m_unary_op(std::move(uop)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, value_type& update, + const bool final_pass) const { + if (final_pass) { + if (i == 0) { + // for both ExclusiveScan and TransformExclusiveScan, + // init is unmodified + m_first_dest[i] = m_init_value; + } else { + m_first_dest[i] = m_binary_op(update.val, m_init_value); + } + } + + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; + this->join(update, tmp); + } + + KOKKOS_FUNCTION void init(value_type& value) const { + value.val = {}; + value.is_initial = true; + } + + KOKKOS_FUNCTION + void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + + if (update.is_initial) { + update.val = input.val; + } else { + update.val = m_binary_op(update.val, input.val); + } + update.is_initial = false; + } +}; + +template +struct TransformExclusiveScanFunctorWithoutValueWrapper { + using execution_space = ExeSpace; + + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + 
BinaryOpType m_binary_op; + UnaryOpType m_unary_op; + + KOKKOS_FUNCTION + TransformExclusiveScanFunctorWithoutValueWrapper(ValueType init, + FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)), + m_binary_op(std::move(bop)), + m_unary_op(std::move(uop)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, ValueType& update, + const bool final_pass) const { + if (final_pass) { + if (i == 0) { + // for both ExclusiveScan and TransformExclusiveScan, + // init is unmodified + m_first_dest[i] = m_init_value; + } else { + m_first_dest[i] = m_binary_op(update, m_init_value); + } + } + + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; + this->join(update, tmp); + } + + KOKKOS_FUNCTION + void init(ValueType& update) const { update = {}; } + + KOKKOS_FUNCTION + void join(ValueType& update, const ValueType& input) const { + update = m_binary_op(update, input); + } +}; + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp index 3bb337de36f..d832f8849d1 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp @@ -21,6 +21,7 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include "Kokkos_ValueWrapperForNoNeutralElement.hpp" +#include "Kokkos_FunctorsForExclusiveScan.hpp" #include #include @@ -28,69 +29,13 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template -struct TransformExclusiveScanFunctor { - using execution_space = ExeSpace; - using value_type = - ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; - - ValueType m_init_value; - FirstFrom m_first_from; - FirstDest 
m_first_dest; - BinaryOpType m_binary_op; - UnaryOpType m_unary_op; - - KOKKOS_FUNCTION - TransformExclusiveScanFunctor(ValueType init, FirstFrom first_from, - FirstDest first_dest, BinaryOpType bop, - UnaryOpType uop) - : m_init_value(std::move(init)), - m_first_from(std::move(first_from)), - m_first_dest(std::move(first_dest)), - m_binary_op(std::move(bop)), - m_unary_op(std::move(uop)) {} - - KOKKOS_FUNCTION - void operator()(const IndexType i, value_type& update, - const bool final_pass) const { - if (final_pass) { - if (i == 0) { - // for both ExclusiveScan and TransformExclusiveScan, - // init is unmodified - m_first_dest[i] = m_init_value; - } else { - m_first_dest[i] = m_binary_op(update.val, m_init_value); - } - } - - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; - this->join(update, tmp); - } - - KOKKOS_FUNCTION - void init(value_type& update) const { - update.val = {}; - update.is_initial = true; - } - - KOKKOS_FUNCTION - void join(value_type& update, const value_type& input) const { - if (input.is_initial) return; - - if (update.is_initial) { - update.val = input.val; - } else { - update.val = m_binary_op(update.val, input.val); - } - update.is_initial = false; - } -}; - +// +// exespace impl +// template -OutputIteratorType transform_exclusive_scan_impl( +OutputIteratorType transform_exclusive_scan_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop, @@ -103,23 +48,70 @@ OutputIteratorType transform_exclusive_scan_impl( // aliases using index_type = typename InputIteratorType::difference_type; - using func_type = - TransformExclusiveScanFunctor; + + using func_type = std::conditional_t< + ::Kokkos::is_detected::value, + TransformExclusiveScanFunctorWithoutValueWrapper< + ExecutionSpace, index_type, ValueType, InputIteratorType, + OutputIteratorType, BinaryOpType, UnaryOpType>, + 
TransformExclusiveScanFunctorWithValueWrapper< + ExecutionSpace, index_type, ValueType, InputIteratorType, + OutputIteratorType, BinaryOpType, UnaryOpType> >; // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_scan( label, RangePolicy(ex, 0, num_elements), - func_type(init_value, first_from, first_dest, bop, uop)); + func_type(std::move(init_value), first_from, first_dest, bop, uop)); ex.fence("Kokkos::transform_exclusive_scan: fence after operation"); // return return first_dest + num_elements; } +// +// team impl +// +template +KOKKOS_FUNCTION OutputIteratorType transform_exclusive_scan_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop, UnaryOpType uop) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "The team-level impl of Kokkos::Experimental::transform_exclusive_scan " + "currently does not support types without reduction identity"); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using index_type = typename InputIteratorType::difference_type; + using func_type = TransformExclusiveScanFunctorWithoutValueWrapper< + exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType, + BinaryOpType, UnaryOpType>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan( + TeamThreadRange(teamHandle, 0, num_elements), + func_type(std::move(init_value), first_from, first_dest, bop, uop)); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff 
--git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index 74611c677fc..cf5a0e7f68d 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -153,6 +153,18 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() + # ------------------------------------------ + # std team P + # ------------------------------------------ + set(STDALGO_TEAM_SOURCES_P) + foreach(Name + StdAlgorithmsCommon + StdAlgorithmsTeamExclusiveScan + StdAlgorithmsTeamTransformExclusiveScan + ) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) + endforeach() + # ------------------------------------------ # std team M # ------------------------------------------ @@ -419,7 +431,7 @@ foreach(ID A;B;C;D;E) ) endforeach() -foreach(ID A;B;C;D;E;F;G;H;I;L;M) +foreach(ID A;B;C;D;E;F;G;H;I;L;M;P) KOKKOS_ADD_EXECUTABLE_AND_TEST( AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES diff --git a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 799de8b0c49..6ab68a1987d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -348,8 +348,9 @@ TEST(std_algorithms_numeric_ops_test, exclusive_scan_functor) { int dummy = 0; using view_type = Kokkos::View; view_type dummy_view("dummy_view", 0); - using functor_type = Kokkos::Experimental::Impl::ExclusiveScanDefaultFunctor< - exespace, int, int, view_type, view_type>; + using functor_type = + Kokkos::Experimental::Impl::ExclusiveScanDefaultFunctorWithValueWrapper< + exespace, int, int, view_type, view_type>; functor_type functor(dummy, dummy_view, dummy_view); using value_type = functor_type::value_type; diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp new file mode 100644 index 
00000000000..c6b2566c6cf --- /dev/null +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -0,0 +1,253 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace Test { +namespace stdalgos { +namespace TeamExclusiveScan { + +namespace KE = Kokkos::Experimental; + +template +struct PlusFunctor { + KOKKOS_INLINE_FUNCTION constexpr ValueType operator()( + const ValueType& lhs, const ValueType& rhs) const { + return lhs + rhs; + } +}; + +template +struct TestFunctorA { + SourceViewType m_sourceView; + DestViewType m_destView; + DistancesViewType m_distancesView; + IntraTeamSentinelView m_intraTeamSentinelView; + InitValuesViewType m_initValuesView; + BinaryOpType m_binaryOp; + int m_apiPick; + + TestFunctorA(const SourceViewType sourceView, const DestViewType destView, + const DistancesViewType distancesView, + const IntraTeamSentinelView intraTeamSentinelView, + const InitValuesViewType initValuesView, BinaryOpType binaryOp, + int apiPick) + : m_sourceView(sourceView), + m_destView(destView), + m_distancesView(distancesView), + m_intraTeamSentinelView(intraTeamSentinelView), + m_initValuesView(initValuesView), + m_binaryOp(binaryOp), + m_apiPick(apiPick) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const { + const auto rowIndex = member.league_rank(); + + auto rowViewSrc = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL()); + auto rowViewDest = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL()); + const auto 
initVal = m_initValuesView(rowIndex); + ptrdiff_t resultDist = 0; + + switch (m_apiPick) { + case 0: { + auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); + resultDist = KE::distance(KE::begin(rowViewDest), it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 1: { + auto it = KE::exclusive_scan(member, rowViewSrc, rowViewDest, initVal); + resultDist = KE::distance(KE::begin(rowViewDest), it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + +#if not defined KOKKOS_ENABLE_OPENMPTARGET + + case 2: { + auto it = KE::exclusive_scan( + member, KE::cbegin(rowViewSrc), KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal, m_binaryOp); + resultDist = KE::distance(KE::begin(rowViewDest), it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 3: { + auto it = KE::exclusive_scan(member, rowViewSrc, rowViewDest, initVal, + m_binaryOp); + resultDist = KE::distance(KE::begin(rowViewDest), it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } +#endif + } + + // store result of checking if all members have their local + // values matching the one stored in m_distancesView + member.team_barrier(); + const bool intraTeamCheck = team_members_have_matching_result( + member, resultDist, m_distancesView(rowIndex)); + Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { + m_intraTeamSentinelView(rowIndex) = intraTeamCheck; + }); + } +}; + +template +void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { + /* description: + use a rank-2 view randomly filled with values, + and run a team-level exclusive_scan + */ + + // ----------------------------------------------- + // prepare data + // 
----------------------------------------------- + // create a view in the memory space associated with default exespace + // with as many rows as the number of teams and fill it with random + // values from an arbitrary range. + constexpr ValueType lowerBound = 5; + constexpr ValueType upperBound = 523; + const auto bounds = make_bounds(lowerBound, upperBound); + + auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone( + LayoutTag{}, numTeams, numCols, bounds, "sourceView"); + + // ----------------------------------------------- + // launch kokkos kernel + // ----------------------------------------------- + using space_t = Kokkos::DefaultExecutionSpace; + Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + + // exclusive_scan returns an iterator so to verify that it is correct + // each team stores the distance of the returned iterator from the beginning + // of the interval that team operates on and then we check that these + // distances match the std result + Kokkos::View distancesView("distancesView", numTeams); + // sentinel to check if all members of the team compute the same result + Kokkos::View intraTeamSentinelView("intraTeamSameResult", numTeams); + + PlusFunctor binaryOp; + + // Create view of reduce init values to be used by test cases + Kokkos::View initValuesView_h( + "initValuesView_h", numTeams); + using rand_pool = + Kokkos::Random_XorShift64_Pool; + rand_pool pool(lowerBound * upperBound); + Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); + + // use CTAD for functor + auto initValuesView = + Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + + // ----------------------------------------------- + // run cpp-std kernel and check + // 
----------------------------------------------- + auto distancesView_h = create_host_space_copy(distancesView); + auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); + Kokkos::View stdDestView("stdDestView", + numTeams, numCols); + + for (std::size_t i = 0; i < sourceView.extent(0); ++i) { + auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); + auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); + auto initValue = initValuesView_h(i); + + ASSERT_TRUE(intraTeamSentinelView_h(i)); + +// libstdc++ as provided by GCC 8 does not have exclusive_scan and +// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy +#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9) +#define exclusive_scan testing_exclusive_scan +#else +#define exclusive_scan std::exclusive_scan +#endif + switch (apiId) { + case 0: + case 1: { + auto it = exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom), + KE::begin(rowDest), initValue); + const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + break; + } + +#if not defined KOKKOS_ENABLE_OPENMPTARGET + case 2: + case 3: { + auto it = exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom), + KE::begin(rowDest), initValue, binaryOp); + const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + + break; + } +#endif + } + +#undef exclusive_scan + } + + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); +} + +template +void run_all_scenarios() { + for (int numTeams : teamSizesToTest) { + for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { +#if not defined KOKKOS_ENABLE_OPENMPTARGET + for (int apiId : {0, 1, 2, 3}) { +#else + for (int apiId : {0, 1}) { +#endif + test_A(numTeams, numCols, apiId); + } + } + } +} + +TEST(std_algorithms_exclusive_scan_team_test, test) { + 
run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); +} + +} // namespace TeamExclusiveScan +} // namespace stdalgos +} // namespace Test diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp new file mode 100644 index 00000000000..9f30812d8ef --- /dev/null +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -0,0 +1,228 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#if not defined KOKKOS_ENABLE_OPENMPTARGET + +namespace Test { +namespace stdalgos { +namespace TeamTransformExclusiveScan { + +namespace KE = Kokkos::Experimental; + +template +struct PlusFunctor { + KOKKOS_INLINE_FUNCTION + ValueType operator()(const ValueType& lhs, const ValueType& rhs) const { + return lhs + rhs; + } +}; + +template +struct MultipliesByTwoFunctor { + KOKKOS_INLINE_FUNCTION + ValueType operator()(const ValueType& value) const { return value * 2; } +}; + +template +struct TestFunctorA { + SourceViewType m_sourceView; + DestViewType m_destView; + DistancesViewType m_distancesView; + IntraTeamSentinelView m_intraTeamSentinelView; + InitValuesViewType m_initValuesView; + BinaryOpType m_binaryOp; + UnaryOpType m_unaryOp; + int m_apiPick; + + TestFunctorA(const SourceViewType sourceView, const DestViewType destView, + const DistancesViewType distancesView, + const IntraTeamSentinelView intraTeamSentinelView, + const InitValuesViewType 
initValuesView, BinaryOpType binaryOp, + UnaryOpType unaryOp, int apiPick) + : m_sourceView(sourceView), + m_destView(destView), + m_distancesView(distancesView), + m_intraTeamSentinelView(intraTeamSentinelView), + m_initValuesView(initValuesView), + m_binaryOp(binaryOp), + m_unaryOp(unaryOp), + m_apiPick(apiPick) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const { + const auto rowIndex = member.league_rank(); + + auto rowViewSrc = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL()); + auto rowViewDest = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL()); + const auto initVal = m_initValuesView(rowIndex); + ptrdiff_t resultDist = 0; + + switch (m_apiPick) { + case 0: { + auto it = KE::transform_exclusive_scan( + member, KE::cbegin(rowViewSrc), KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal, m_binaryOp, m_unaryOp); + resultDist = KE::distance(KE::begin(rowViewDest), it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 1: { + auto it = KE::transform_exclusive_scan(member, rowViewSrc, rowViewDest, + initVal, m_binaryOp, m_unaryOp); + resultDist = KE::distance(KE::begin(rowViewDest), it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + } + + // store result of checking if all members have their local + // values matching the one stored in m_distancesView + member.team_barrier(); + const bool intraTeamCheck = team_members_have_matching_result( + member, resultDist, m_distancesView(rowIndex)); + Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { + m_intraTeamSentinelView(rowIndex) = intraTeamCheck; + }); + } +}; + +template +void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { + /* description: + use a rank-2 view randomly filled with values, + and run a team-level transform_exclusive_scan + */ + + // ----------------------------------------------- + // 
prepare data + // ----------------------------------------------- + // create a view in the memory space associated with default exespace + // with as many rows as the number of teams and fill it with random + // values from an arbitrary range. + constexpr ValueType lowerBound = 5; + constexpr ValueType upperBound = 523; + const auto bounds = make_bounds(lowerBound, upperBound); + + auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone( + LayoutTag{}, numTeams, numCols, bounds, "sourceView"); + + // ----------------------------------------------- + // launch kokkos kernel + // ----------------------------------------------- + using space_t = Kokkos::DefaultExecutionSpace; + Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + + // tranform_exclusive_scan returns an iterator so to verify that it is correct + // each team stores the distance of the returned iterator from the beginning + // of the interval that team operates on and then we check that these + // distances match the std result + Kokkos::View distancesView("distancesView", numTeams); + // sentinel to check if all members of the team compute the same result + Kokkos::View intraTeamSentinelView("intraTeamSameResult", numTeams); + + PlusFunctor binaryOp; + MultipliesByTwoFunctor unaryOp; + + // Create view of reduce init values to be used by test cases + Kokkos::View initValuesView_h( + "initValuesView_h", numTeams); + using rand_pool = + Kokkos::Random_XorShift64_Pool; + rand_pool pool(lowerBound * upperBound); + Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); + + // use CTAD for functor + auto initValuesView = + Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + + // 
----------------------------------------------- + // run cpp-std kernel and check + // ----------------------------------------------- + auto distancesView_h = create_host_space_copy(distancesView); + auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); + Kokkos::View stdDestView("stdDestView", + numTeams, numCols); + + for (std::size_t i = 0; i < sourceView.extent(0); ++i) { + auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); + auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); + auto initValue = initValuesView_h(i); + ASSERT_TRUE(intraTeamSentinelView_h(i)); + +// libstdc++ as provided by GCC 8 does not have transform_exclusive_scan and +// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy +#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9) +#define transform_exclusive_scan testing_transform_exclusive_scan +#else +#define transform_exclusive_scan std::transform_exclusive_scan +#endif + + switch (apiId) { + case 0: + case 1: { + auto it = transform_exclusive_scan( + KE::cbegin(rowFrom), KE::cend(rowFrom), KE::begin(rowDest), + initValue, binaryOp, unaryOp); + const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + break; + } + } + +#undef transform_exclusive_scan + } + + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); +} + +template +void run_all_scenarios() { + for (int numTeams : teamSizesToTest) { + for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { + for (int apiId : {0, 1}) { + test_A(numTeams, numCols, apiId); + } + } + } +} + +TEST(std_algorithms_transform_exclusive_scan_team_test, test) { + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); +} + +} // namespace TeamTransformExclusiveScan +} // namespace stdalgos +} // namespace Test + +#endif diff --git 
a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 75525b3b0f9..9dac3ce75ff 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -295,7 +295,7 @@ TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan_functor) { Kokkos::Experimental::Impl::StdNumericScanIdentityReferenceUnaryFunctor< int>; using functor_type = - Kokkos::Experimental::Impl::TransformExclusiveScanFunctor< + Kokkos::Experimental::Impl::TransformExclusiveScanFunctorWithValueWrapper< exespace, int, int, view_type, view_type, MultiplyFunctor, unary_op_type>; functor_type functor(dummy, dummy_view, dummy_view, {}, {}); From 2075ae79b06f10b228b0542125c9a5c287af2f13 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Wed, 4 Oct 2023 11:45:57 -0600 Subject: [PATCH 028/432] core/src: Add half single and double mixed compare (LT,GT,LE,GE) (#6407) * core/src: Add half single and double mixed compare (LT,GT,LE,GE) * Implement PR feedback: - Check whether T is convertible to float - Try upcasting floating_point_wrapper to float and relying on the toolchains implicit upcasting to kick in - Try comparing impl_type with T if impl_type is a full type * Add missing endif * Add missing ifdefs * Update core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp Co-authored-by: Daniel Arndt * Remove HALF_IS_FULL_TYPE branch --------- Co-authored-by: Daniel Arndt --- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 72 ++++++++++++ core/unit_test/TestHalfOperators.hpp | 103 +++++++++++++----- 2 files changed, 148 insertions(+), 27 deletions(-) diff --git a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index e30949431f8..b1ff643a71e 100644 --- a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ 
b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -839,6 +839,24 @@ class alignas(FloatType) floating_point_wrapper { return tmp_lhs < tmp_rhs; } + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator<(floating_point_wrapper lhs, T rhs) { + return static_cast(lhs) < rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator<(T lhs, floating_point_wrapper rhs) { + return lhs < static_cast(rhs); + } + KOKKOS_FUNCTION friend bool operator>(const volatile floating_point_wrapper& lhs, const volatile floating_point_wrapper& rhs) { @@ -846,6 +864,24 @@ class alignas(FloatType) floating_point_wrapper { return tmp_lhs > tmp_rhs; } + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator>(floating_point_wrapper lhs, T rhs) { + return static_cast(lhs) > rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator>(T lhs, floating_point_wrapper rhs) { + return lhs > static_cast(rhs); + } + KOKKOS_FUNCTION friend bool operator<=(const volatile floating_point_wrapper& lhs, const volatile floating_point_wrapper& rhs) { @@ -853,6 +889,24 @@ class alignas(FloatType) floating_point_wrapper { return tmp_lhs <= tmp_rhs; } + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator<=(floating_point_wrapper lhs, T rhs) { + return static_cast(lhs) <= rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator<=(T lhs, floating_point_wrapper rhs) { + return lhs <= static_cast(rhs); + } + KOKKOS_FUNCTION friend bool operator>=(const volatile floating_point_wrapper& lhs, const volatile floating_point_wrapper& rhs) { @@ -860,6 +914,24 @@ class alignas(FloatType) floating_point_wrapper { return tmp_lhs >= tmp_rhs; } + 
template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator>=(floating_point_wrapper lhs, T rhs) { + return static_cast(lhs) >= rhs; + } + + template + KOKKOS_FUNCTION friend std::enable_if_t && + (std::is_same_v || + std::is_same_v), + bool> + operator>=(T lhs, floating_point_wrapper rhs) { + return lhs >= static_cast(rhs); + } + // Insertion and extraction operators friend std::ostream& operator<<(std::ostream& os, const floating_point_wrapper& x) { diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index 29844a3c6aa..bf7013cf738 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -241,10 +241,27 @@ enum OP_TESTS { OR, EQ, NEQ, - LT, - GT, - LE, - GE, // TODO: TW, + LT_H_H, + LT_H_S, + LT_S_H, + LT_H_D, + LT_D_H, + GT_H_H, + GT_H_S, + GT_S_H, + GT_H_D, + GT_D_H, + LE_H_H, + LE_H_S, + LE_S_H, + LE_H_D, + LE_D_H, + GE_H_H, + GE_H_S, + GE_S_H, + GE_H_D, + GE_D_H, + // TODO: TW, PASS_BY_REF, AO_IMPL_HALF, AO_HALF_T, @@ -292,20 +309,20 @@ struct Functor_TestHalfVolatileOperators { actual_lhs(ASSIGN) = static_cast(nv_tmp); expected_lhs(ASSIGN) = d_lhs; - actual_lhs(LT) = h_lhs < h_rhs; - expected_lhs(LT) = d_lhs < d_rhs; + actual_lhs(LT_H_H) = h_lhs < h_rhs; + expected_lhs(LT_H_H) = d_lhs < d_rhs; - actual_lhs(LE) = h_lhs <= h_rhs; - expected_lhs(LE) = d_lhs <= d_rhs; + actual_lhs(LE_H_H) = h_lhs <= h_rhs; + expected_lhs(LE_H_H) = d_lhs <= d_rhs; actual_lhs(NEQ) = h_lhs != h_rhs; expected_lhs(NEQ) = d_lhs != d_rhs; - actual_lhs(GT) = h_lhs > h_rhs; - expected_lhs(GT) = d_lhs > d_rhs; + actual_lhs(GT_H_H) = h_lhs > h_rhs; + expected_lhs(GT_H_H) = d_lhs > d_rhs; - actual_lhs(GE) = h_lhs >= h_rhs; - expected_lhs(GE) = d_lhs >= d_rhs; + actual_lhs(GE_H_H) = h_lhs >= h_rhs; + expected_lhs(GE_H_H) = d_lhs >= d_rhs; actual_lhs(EQ) = h_lhs == h_rhs; expected_lhs(EQ) = d_lhs == d_rhs; @@ -879,17 +896,49 @@ struct Functor_TestHalfOperators { 
actual_lhs(NEQ) = h_lhs != h_rhs; expected_lhs(NEQ) = d_lhs != d_rhs; - actual_lhs(LT) = h_lhs < h_rhs; - expected_lhs(LT) = d_lhs < d_rhs; - - actual_lhs(GT) = h_lhs > h_rhs; - expected_lhs(GT) = d_lhs > d_rhs; - - actual_lhs(LE) = h_lhs <= h_rhs; - expected_lhs(LE) = d_lhs <= d_rhs; - - actual_lhs(GE) = h_lhs >= h_rhs; - expected_lhs(GE) = d_lhs >= d_rhs; + actual_lhs(LT_H_H) = h_lhs < h_rhs; + expected_lhs(LT_H_H) = d_lhs < d_rhs; + actual_lhs(LT_H_S) = h_lhs < static_cast(h_rhs); + expected_lhs(LT_H_S) = d_lhs < d_rhs; + actual_lhs(LT_S_H) = static_cast(h_lhs) < h_rhs; + expected_lhs(LT_S_H) = d_lhs < d_rhs; + actual_lhs(LT_H_D) = h_lhs < static_cast(h_rhs); + expected_lhs(LT_H_D) = d_lhs < d_rhs; + actual_lhs(LT_D_H) = static_cast(h_lhs) < h_rhs; + expected_lhs(LT_D_H) = d_lhs < d_rhs; + + actual_lhs(GT_H_H) = h_lhs > h_rhs; + expected_lhs(GT_H_H) = d_lhs > d_rhs; + actual_lhs(GT_H_S) = h_lhs > static_cast(h_rhs); + expected_lhs(GT_H_S) = d_lhs > d_rhs; + actual_lhs(GT_S_H) = static_cast(h_lhs) > h_rhs; + expected_lhs(GT_S_H) = d_lhs > d_rhs; + actual_lhs(GT_H_D) = h_lhs > static_cast(h_rhs); + expected_lhs(GT_H_D) = d_lhs > d_rhs; + actual_lhs(GT_D_H) = static_cast(h_lhs) > h_rhs; + expected_lhs(GT_D_H) = d_lhs > d_rhs; + + actual_lhs(LE_H_H) = h_lhs <= h_rhs; + expected_lhs(LE_H_H) = d_lhs <= d_rhs; + actual_lhs(LE_H_S) = h_lhs <= static_cast(h_rhs); + expected_lhs(LE_H_S) = d_lhs <= d_rhs; + actual_lhs(LE_S_H) = static_cast(h_lhs) <= h_rhs; + expected_lhs(LE_S_H) = d_lhs <= d_rhs; + actual_lhs(LE_H_D) = h_lhs <= static_cast(h_rhs); + expected_lhs(LE_H_D) = d_lhs <= d_rhs; + actual_lhs(LE_D_H) = static_cast(h_lhs) <= h_rhs; + expected_lhs(LE_D_H) = d_lhs <= d_rhs; + + actual_lhs(GE_H_H) = h_lhs >= h_rhs; + expected_lhs(GE_H_H) = d_lhs >= d_rhs; + actual_lhs(GE_H_S) = h_lhs >= static_cast(h_rhs); + expected_lhs(GE_H_S) = d_lhs >= d_rhs; + actual_lhs(GE_S_H) = static_cast(h_lhs) >= h_rhs; + expected_lhs(GE_S_H) = d_lhs >= d_rhs; + actual_lhs(GE_H_D) = h_lhs >= 
static_cast(h_rhs); + expected_lhs(GE_H_D) = d_lhs >= d_rhs; + actual_lhs(GE_D_H) = static_cast(h_lhs) >= h_rhs; + expected_lhs(GE_D_H) = d_lhs >= d_rhs; // actual_lhs(TW) = h_lhs <=> h_rhs; // Need C++20? // expected_lhs(TW) = d_lhs <=> d_rhs; // Need C++20? @@ -961,10 +1010,10 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { // printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT || op_test == LE || op_test == NEQ || - op_test == EQ || op_test == GT || op_test == GE || - op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H || - op_test == CDIV_H_H) { + if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || + op_test == NEQ || op_test == EQ || op_test == GT_H_H || + op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || + op_test == CMUL_H_H || op_test == CDIV_H_H) { ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), epsilon); ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), From 0bf937cdd8cbfa172e93297d81d220dd6d94d87d Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Wed, 4 Oct 2023 13:08:47 -0500 Subject: [PATCH 029/432] Moving abort and assert into their own public headers (#6445) Moving abort and assert into their own public headers --- Makefile.targets | 2 + core/src/Kokkos_Abort.hpp | 105 +++++++++++++++++++++++++ core/src/Kokkos_Assert.hpp | 70 +++++++++++++++++ core/src/impl/Kokkos_Abort.cpp | 44 +++++++++++ core/src/impl/Kokkos_Error.cpp | 18 +---- core/src/impl/Kokkos_Error.hpp | 140 +-------------------------------- 6 files changed, 224 insertions(+), 155 deletions(-) create mode 100644 core/src/Kokkos_Abort.hpp create mode 100644 core/src/Kokkos_Assert.hpp create mode 100644 core/src/impl/Kokkos_Abort.cpp diff --git a/Makefile.targets b/Makefile.targets index 4e08a46c695..ec8770dd7de 100644 --- 
a/Makefile.targets +++ b/Makefile.targets @@ -36,6 +36,8 @@ Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp +Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort.cpp ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp diff --git a/core/src/Kokkos_Abort.hpp b/core/src/Kokkos_Abort.hpp new file mode 100644 index 00000000000..5639933ecb0 --- /dev/null +++ b/core/src/Kokkos_Abort.hpp @@ -0,0 +1,105 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ABORT_HPP +#define KOKKOS_ABORT_HPP + +#include +#include +#ifdef KOKKOS_ENABLE_CUDA +#include +#endif +#ifdef KOKKOS_ENABLE_HIP +#include +#endif +#ifdef KOKKOS_ENABLE_SYCL +#include +#endif + +namespace Kokkos { +namespace Impl { + +[[noreturn]] void host_abort(const char *const); + +#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) +// required to workaround failures in random number generator unit tests with +// pre-volta architectures +#define KOKKOS_IMPL_ABORT_NORETURN +#else +// cuda_abort aborts when building for other platforms than macOS +#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] +#endif + +#elif defined(KOKKOS_COMPILER_NVHPC) + +#define KOKKOS_IMPL_ABORT_NORETURN + +#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__) +// HIP aborts +#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] +#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL SYCL doesn't abort +#define KOKKOS_IMPL_ABORT_NORETURN +#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) +// Host aborts +#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] +#else +// Everything else does not abort +#define KOKKOS_IMPL_ABORT_NORETURN +#endif + +// FIXME_SYCL +// Accomodate host pass for device functions that are not [[noreturn]] +#if defined(KOKKOS_ENABLE_SYCL) || \ + (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)) +#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE +#else +#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE KOKKOS_IMPL_ABORT_NORETURN +#endif + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ + defined(KOKKOS_ENABLE_OPENACC) +KOKKOS_IMPL_ABORT_NORETURN_DEVICE inline KOKKOS_IMPL_DEVICE_FUNCTION void +device_abort(const char *const msg) { +#if defined(KOKKOS_ENABLE_CUDA) + 
::Kokkos::Impl::cuda_abort(msg); +#elif defined(KOKKOS_ENABLE_HIP) + ::Kokkos::Impl::hip_abort(msg); +#elif defined(KOKKOS_ENABLE_SYCL) + ::Kokkos::Impl::sycl_abort(msg); +#elif defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC) + printf("%s", msg); // FIXME_OPENMPTARGET FIXME_OPENACC +#else +#error faulty logic +#endif +} +#endif +} // namespace Impl + +KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort( + const char *const message) { + KOKKOS_IF_ON_HOST(::Kokkos::Impl::host_abort(message);) + KOKKOS_IF_ON_DEVICE(::Kokkos::Impl::device_abort(message);) +} + +#undef KOKKOS_IMPL_ABORT_NORETURN + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_ABORT_HPP */ diff --git a/core/src/Kokkos_Assert.hpp b/core/src/Kokkos_Assert.hpp new file mode 100644 index 00000000000..c3b9004734a --- /dev/null +++ b/core/src/Kokkos_Assert.hpp @@ -0,0 +1,70 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ASSERT_HPP +#define KOKKOS_ASSERT_HPP + +#include +#include + +#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) +#define KOKKOS_EXPECTS(...) \ + { \ + if (!bool(__VA_ARGS__)) { \ + ::Kokkos::abort( \ + "Kokkos contract violation:\n " \ + " Expected precondition `" #__VA_ARGS__ \ + "` evaluated false.\n" \ + "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \ + __LINE__) " \n"); \ + } \ + } +#define KOKKOS_ENSURES(...) 
\ + { \ + if (!bool(__VA_ARGS__)) { \ + ::Kokkos::abort( \ + "Kokkos contract violation:\n " \ + " Ensured postcondition `" #__VA_ARGS__ \ + "` evaluated false.\n" \ + "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \ + __LINE__) " \n"); \ + } \ + } +// some projects already define this for themselves, so don't mess +// them up +#ifndef KOKKOS_ASSERT +#define KOKKOS_ASSERT(...) \ + { \ + if (!bool(__VA_ARGS__)) { \ + ::Kokkos::abort( \ + "Kokkos contract violation:\n " \ + " Asserted condition `" #__VA_ARGS__ \ + "` evaluated false.\n" \ + "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \ + __LINE__) " \n"); \ + } \ + } +#endif // ifndef KOKKOS_ASSERT +#else // not debug mode +#define KOKKOS_EXPECTS(...) +#define KOKKOS_ENSURES(...) +#ifndef KOKKOS_ASSERT +#define KOKKOS_ASSERT(...) +#endif // ifndef KOKKOS_ASSERT +#endif // end debug mode ifdefs + +#endif /* #ifndef KOKKOS_ASSERT_HPP */ diff --git a/core/src/impl/Kokkos_Abort.cpp b/core/src/impl/Kokkos_Abort.cpp new file mode 100644 index 00000000000..23f663e37fb --- /dev/null +++ b/core/src/impl/Kokkos_Abort.cpp @@ -0,0 +1,44 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +void host_abort(const char *const message) { + std::cerr << message; + +#ifdef KOKKOS_IMPL_ENABLE_STACKTRACE + std::cerr << "\nBacktrace:\n"; + save_stacktrace(); + print_demangled_saved_stacktrace(std::cerr); +#else + std::cerr << "\nTraceback functionality not available\n"; +#endif + + ::abort(); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/core/src/impl/Kokkos_Error.cpp b/core/src/impl/Kokkos_Error.cpp index b6376bf5778..4babe2d72bd 100644 --- a/core/src/impl/Kokkos_Error.cpp +++ b/core/src/impl/Kokkos_Error.cpp @@ -21,12 +21,11 @@ #include #include -#include +#include #include #include #include #include -#include #include //---------------------------------------------------------------------------- @@ -34,26 +33,11 @@ namespace Kokkos { namespace Impl { -void traceback_callstack(std::ostream &msg) { -#ifdef KOKKOS_IMPL_ENABLE_STACKTRACE - msg << "\nBacktrace:\n"; - save_stacktrace(); - print_demangled_saved_stacktrace(msg); -#else - msg << "\nTraceback functionality not available\n"; -#endif -} void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } -void host_abort(const char *const message) { - std::cerr << message; - traceback_callstack(std::cerr); - ::abort(); -} - std::string human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; diff --git a/core/src/impl/Kokkos_Error.hpp b/core/src/impl/Kokkos_Error.hpp index e4bfaf189e2..3d0b1d3274c 100644 --- a/core/src/impl/Kokkos_Error.hpp +++ b/core/src/impl/Kokkos_Error.hpp @@ -20,82 +20,14 @@ #include #include #include -#ifdef KOKKOS_ENABLE_CUDA -#include -#endif -#ifdef KOKKOS_ENABLE_HIP -#include -#endif -#ifdef KOKKOS_ENABLE_SYCL -#include -#endif +#include +#include namespace Kokkos { 
namespace Impl { -[[noreturn]] void host_abort(const char *const); - -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) -// required to workaround failures in random number generator unit tests with -// pre-volta architectures -#define KOKKOS_IMPL_ABORT_NORETURN -#else -// cuda_abort aborts when building for other platforms than macOS -#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] -#endif - -#elif defined(KOKKOS_COMPILER_NVHPC) - -#define KOKKOS_IMPL_ABORT_NORETURN - -#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__) -// HIP aborts -#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] -#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__) -// FIXME_SYCL SYCL doesn't abort -#define KOKKOS_IMPL_ABORT_NORETURN -#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) -// Host aborts -#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] -#else -// Everything else does not abort -#define KOKKOS_IMPL_ABORT_NORETURN -#endif - -// FIXME_SYCL -// Accomodate host pass for device functions that are not [[noreturn]] -#if defined(KOKKOS_ENABLE_SYCL) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)) -#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE -#else -#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE KOKKOS_IMPL_ABORT_NORETURN -#endif - -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ - defined(KOKKOS_ENABLE_OPENACC) -KOKKOS_IMPL_ABORT_NORETURN_DEVICE inline KOKKOS_IMPL_DEVICE_FUNCTION void -device_abort(const char *const msg) { -#if defined(KOKKOS_ENABLE_CUDA) - ::Kokkos::Impl::cuda_abort(msg); -#elif defined(KOKKOS_ENABLE_HIP) - ::Kokkos::Impl::hip_abort(msg); -#elif defined(KOKKOS_ENABLE_SYCL) - ::Kokkos::Impl::sycl_abort(msg); -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC) - printf("%s", msg); // FIXME_OPENMPTARGET FIXME_OPENACC 
-#else -#error faulty logic -#endif -} -#endif - [[noreturn]] void throw_runtime_exception(const std::string &msg); -void traceback_callstack(std::ostream &); - std::string human_memory_size(size_t arg_bytes); } // namespace Impl @@ -195,72 +127,4 @@ class RawMemoryAllocationFailure : public std::bad_alloc { } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort( - const char *const message) { - KOKKOS_IF_ON_HOST(::Kokkos::Impl::host_abort(message);) - KOKKOS_IF_ON_DEVICE(::Kokkos::Impl::device_abort(message);) -} - -#undef KOKKOS_IMPL_ABORT_NORETURN - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ - defined(KOKKOS_ENABLE_DEBUG) -#define KOKKOS_EXPECTS(...) \ - { \ - if (!bool(__VA_ARGS__)) { \ - ::Kokkos::abort( \ - "Kokkos contract violation:\n " \ - " Expected precondition `" #__VA_ARGS__ \ - "` evaluated false.\n" \ - "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \ - __LINE__) " \n"); \ - } \ - } -#define KOKKOS_ENSURES(...) \ - { \ - if (!bool(__VA_ARGS__)) { \ - ::Kokkos::abort( \ - "Kokkos contract violation:\n " \ - " Ensured postcondition `" #__VA_ARGS__ \ - "` evaluated false.\n" \ - "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \ - __LINE__) " \n"); \ - } \ - } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT -#define KOKKOS_ASSERT(...) 
\ - { \ - if (!bool(__VA_ARGS__)) { \ - ::Kokkos::abort( \ - "Kokkos contract violation:\n " \ - " Asserted condition `" #__VA_ARGS__ \ - "` evaluated false.\n" \ - "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \ - __LINE__) " \n"); \ - } \ - } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode -#define KOKKOS_EXPECTS(...) -#define KOKKOS_ENSURES(...) -#ifndef KOKKOS_ASSERT -#define KOKKOS_ASSERT(...) -#endif // ifndef KOKKOS_ASSERT -#endif // end debug mode ifdefs - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - #endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ From c63f125ec12319ce58d3b6a43498ff0ce016c903 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 18 Jul 2023 15:11:06 +0200 Subject: [PATCH 030/432] Add test for parallel_scan with return value for ThreadVectorRange --- core/unit_test/TestTeamVector.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 1a0a1798a56..5a0475a2058 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -607,9 +607,8 @@ struct functor_vec_scan { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. 
-#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ - !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) template struct functor_vec_scan_ret_val { using policy_type = Kokkos::TeamPolicy; @@ -736,9 +735,8 @@ bool test_scalar(int nteams, int team_size, int test) { } else if (test == 12) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ - !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) Kokkos::parallel_for( Kokkos::TeamPolicy(nteams, team_size, 8), functor_vec_scan_ret_val(d_flag, team_size)); From 495b1ccfd8eb34a88edc9faa5c35bb55fce69e15 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Wed, 19 Jul 2023 12:20:11 +0200 Subject: [PATCH 031/432] Add parallel_scan overloads with value for Threads --- core/src/Threads/Kokkos_ThreadsTeam.hpp | 27 +++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/core/src/Threads/Kokkos_ThreadsTeam.hpp b/core/src/Threads/Kokkos_ThreadsTeam.hpp index de48b70cc8a..b1cadc7c485 100644 --- a/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -1038,17 +1038,20 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( * final==true. Scan_val will be set to the final sum value over all vector * lanes. 
*/ -template +template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, - const FunctorType& lambda) { - using value_type = + const FunctorType& lambda, ValueType& return_val) { + // Extract ValueType from the Closure + using closure_value_type = typename Impl::FunctorAnalysis, FunctorType, void>::value_type; + static_assert(std::is_same::value, + "Non-matching value types of closure and return type"); - value_type scan_val = value_type(); + ValueType scan_val = ValueType(); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1057,6 +1060,22 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( i += loop_boundaries.increment) { lambda(i, scan_val, true); } + + return_val = scan_val; +} + +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::ThreadsExecTeamMember>& loop_boundaries, + const FunctorType& lambda) { + using value_type = + typename Impl::FunctorAnalysis, FunctorType, + void>::value_type; + + value_type scan_val; + parallel_scan(loop_boundaries, lambda, scan_val); } /** \brief Intra-thread vector parallel scan with reducer From 4ce289baa53fa131c558a4100efe0a3f052415a8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 4 Oct 2023 14:25:05 -0400 Subject: [PATCH 032/432] Allow detecting SIMD types based on compiler macros (#6188) Allow detecting SIMD types based on compiler macros --- cmake/kokkos_arch.cmake | 62 ++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 2448811875f..387fd3fb803 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -418,10 +418,7 @@ IF (KOKKOS_ARCH_SKL) ENDIF() IF (KOKKOS_ARCH_SKX) - # FIXME_NVHPC nvc++ doesn't seem to support AVX512. 
- IF (NOT KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON ON) - ENDIF() + SET(KOKKOS_ARCH_AVX512XEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Cray NO-VALUE-SPECIFIED @@ -433,10 +430,7 @@ IF (KOKKOS_ARCH_SKX) ENDIF() IF (KOKKOS_ARCH_ICL) - # FIXME_NVHPC nvc++ doesn't seem to support AVX512. - IF (NOT KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON ON) - ENDIF() + SET(KOKKOS_ARCH_AVX512XEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC /arch:AVX512 @@ -445,10 +439,7 @@ IF (KOKKOS_ARCH_ICL) ENDIF() IF (KOKKOS_ARCH_ICX) - # FIXME_NVHPC nvc++ doesn't seem to support AVX512. - IF (NOT KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON ON) - ENDIF() + SET(KOKKOS_ARCH_AVX512XEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC /arch:AVX512 @@ -457,10 +448,7 @@ IF (KOKKOS_ARCH_ICX) ENDIF() IF (KOKKOS_ARCH_SPR) - # FIXME_NVHPC nvc++ doesn't seem to support AVX512. - IF (NOT KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON ON) - ENDIF() + SET(KOKKOS_ARCH_AVX512XEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC /arch:AVX512 @@ -495,6 +483,48 @@ IF (KOKKOS_ARCH_POWER9) ) ENDIF() +# If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect +# the SIMD capabilities based on compiler macros. 
+IF (KOKKOS_ARCH_NATIVE) + # Make sure to rerun the checks if compile options have changed + IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + + SET(CMAKE_REQUIRED_QUIET ON) + SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + INCLUDE(CheckCXXSymbolExists) + + UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) + CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) + CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) + CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + + UNSET(CMAKE_REQUIRED_QUIET) + UNSET(CMAKE_REQUIRED_FLAGS) + ENDIF() + + # Only define one of these macros for now + # to be uniform with what we are doing for other architectures. + IF(KOKKOS_COMPILER_HAS_AVX512) + MESSAGE(STATUS "SIMD: AVX512 detected") + SET(KOKKOS_ARCH_AVX512XEON ON) + ELSEIF(KOKKOS_COMPILER_HAS_AVX2) + MESSAGE(STATUS "SIMD: AVX2 detected") + SET(KOKKOS_ARCH_AVX2 ON) + ELSEIF(KOKKOS_COMPILER_HAS_AVX) + MESSAGE(STATUS "SIMD: AVX detected") + SET(KOKKOS_ARCH_AVX ON) + ENDIF() +ENDIF() + +# FIXME_NVHPC nvc++ doesn't seem to support AVX512. 
+IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + SET(KOKKOS_ARCH_AVX512XEON OFF) +ENDIF() + IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( From 29d4ffdbffec5d454bbd0ab10e3d3b59669e778d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 4 Oct 2023 14:55:53 -0400 Subject: [PATCH 033/432] Add KOKKOS_ARCH_ARM_NEON --- Makefile.kokkos | 5 +++++ cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 10 ++++++++++ simd/src/Kokkos_SIMD.hpp | 11 +++++++---- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index b172b88641a..489ade2d575 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -708,6 +708,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -720,6 +721,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -732,6 +734,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") KOKKOS_CXXFLAGS += -march=armv8.2-a+sve KOKKOS_LDFLAGS += -march=armv8.2-a+sve @@ -787,6 +790,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -800,6 +804,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) tmp := 
$(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index a7594f4ccee..bec59ebd034 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -72,6 +72,7 @@ #cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX2 #cmakedefine KOKKOS_ARCH_AVX512XEON +#cmakedefine KOKKOS_ARCH_ARM_NEON #cmakedefine KOKKOS_ARCH_KNC #cmakedefine KOKKOS_ARCH_AVX512MIC #cmakedefine KOKKOS_ARCH_POWER7 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 387fd3fb803..bccf674d763 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -249,6 +249,7 @@ IF(KOKKOS_ARCH_NATIVE) ENDIF() IF (KOKKOS_ARCH_ARMV80) + SET(KOKKOS_ARCH_ARM_NEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Cray NO-VALUE-SPECIFIED @@ -259,6 +260,7 @@ IF (KOKKOS_ARCH_ARMV80) ENDIF() IF (KOKKOS_ARCH_ARMV81) + SET(KOKKOS_ARCH_ARM_NEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Cray NO-VALUE-SPECIFIED @@ -269,6 +271,7 @@ IF (KOKKOS_ARCH_ARMV81) ENDIF() IF (KOKKOS_ARCH_ARMV8_THUNDERX) + SET(KOKKOS_ARCH_ARM_NEON ON) SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID @@ -280,6 +283,7 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX) ENDIF() IF (KOKKOS_ARCH_ARMV8_THUNDERX2) + SET(KOKKOS_ARCH_ARM_NEON ON) SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID @@ -291,6 +295,7 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX2) ENDIF() IF (KOKKOS_ARCH_A64FX) + SET(KOKKOS_ARCH_ARM_NEON ON) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -march=armv8.2-a+sve -msve-vector-bits=512 @@ -498,6 +503,8 @@ IF (KOKKOS_ARCH_NATIVE) 
CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") @@ -514,6 +521,9 @@ IF (KOKKOS_ARCH_NATIVE) ELSEIF(KOKKOS_COMPILER_HAS_AVX2) MESSAGE(STATUS "SIMD: AVX2 detected") SET(KOKKOS_ARCH_AVX2 ON) + ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) + MESSAGE(STATUS "SIMD: ARM_NEON detected") + SET(KOKKOS_ARCH_ARM_NEON ON) ELSEIF(KOKKOS_COMPILER_HAS_AVX) MESSAGE(STATUS "SIMD: AVX detected") SET(KOKKOS_ARCH_AVX ON) diff --git a/simd/src/Kokkos_SIMD.hpp b/simd/src/Kokkos_SIMD.hpp index 794c56fbdb2..2bb551fdbfe 100644 --- a/simd/src/Kokkos_SIMD.hpp +++ b/simd/src/Kokkos_SIMD.hpp @@ -40,7 +40,7 @@ #include #endif -#ifdef __ARM_NEON +#if defined(KOKKOS_ARCH_ARM_NEON) #include #endif #else // KOKKOS_ENABLE_OPENMPTARGET @@ -62,7 +62,10 @@ #include #endif -#ifdef __ARM_NEON +#if defined(KOKKOS_ARCH_ARM_NEON) +#if !defined(__ARM_NEON) +#error "__ARM_NEON must be definded for KOKKOS_ARCH_ARM_NEON" +#endif #include #endif #endif @@ -82,7 +85,7 @@ namespace Impl { using host_native = avx512_fixed_size<8>; #elif defined(KOKKOS_ARCH_AVX2) using host_native = avx2_fixed_size<4>; -#elif defined(__ARM_NEON) +#elif defined(KOKKOS_ARCH_ARM_NEON) using host_native = neon_fixed_size<2>; #else using host_native = scalar; @@ -185,7 +188,7 @@ using data_type_set = data_types>; using data_type_set = data_types; -#elif defined(__ARM_NEON) +#elif defined(KOKKOS_ARCH_ARM_NEON) using host_abi_set = abi_set>; using data_type_set = data_types; From 6ff0deb9b371ef7dc245888bdece2a924990f2c3 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 4 Oct 2023 16:01:33 -0400 Subject: [PATCH 034/432] Fix implementation for cyl_bessel_i0 --- 
.../Kokkos_MathematicalSpecialFunctions.hpp | 81 ++++++++----------- .../TestMathematicalSpecialFunctions.hpp | 5 +- 2 files changed, 36 insertions(+), 50 deletions(-) diff --git a/core/src/Kokkos_MathematicalSpecialFunctions.hpp b/core/src/Kokkos_MathematicalSpecialFunctions.hpp index 63c2b58ef59..2118a0ad931 100644 --- a/core/src/Kokkos_MathematicalSpecialFunctions.hpp +++ b/core/src/Kokkos_MathematicalSpecialFunctions.hpp @@ -846,69 +846,52 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y1(const CmplxType& z, //! for a complex argument template KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i0(const CmplxType& z, - const RealType& joint_val = 25, - const IntType& bw_start = 70) { + const RealType& joint_val = 18, + const IntType& n_terms = 50) { // This function is converted and modified from the corresponding Fortran - // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special + // programs CIK01 in S. Zhang & J. Jin "Computation of Special // Functions" (Wiley, 1996). // Input : z --- Complex argument // joint_val --- Joint point of abs(z) separating small and large // argument regions - // bw_start --- Starting point for backward recurrence + // n_terms --- Numbers of terms used in the power series // Output: cbi0 --- I0(z) - using Kokkos::numbers::pi_v; - - CmplxType cbi0; - constexpr auto pi = pi_v; - const RealType a[12] = {0.125, - 7.03125e-2, - 7.32421875e-2, - 1.1215209960938e-1, - 2.2710800170898e-1, - 5.7250142097473e-1, - 1.7277275025845e0, - 6.0740420012735e0, - 2.4380529699556e1, - 1.1001714026925e2, - 5.5133589612202e2, - 3.0380905109224e3}; + CmplxType cbi0(1.0, 0.0); RealType a0 = Kokkos::abs(z); CmplxType z1 = z; - if (a0 < 1e-100) { // Treat z=0 as a special case - cbi0 = CmplxType(1.0, 0.0); - } else { + if (a0 > 1e-100) { if (z.real() < 0.0) z1 = -z; - if (a0 <= joint_val) { // Using backward recurrence for |z|<=joint_val - // (default:25) - CmplxType cbs = CmplxType(0.0, 0.0); - // CmplxType csk0 = CmplxType(0.0,0.0); - CmplxType 
cf0 = CmplxType(0.0, 0.0); - CmplxType cf1 = CmplxType(1e-100, 0.0); - CmplxType cf, cs0; - for (int k = bw_start; k >= 0; k--) { // Backward recurrence (default: - // 70) - cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0; - if (k == 0) cbi0 = cf; - // if ((k == 2*(k/2)) && (k != 0)) { - // csk0 = csk0+4.0*cf/static_cast(k); - //} - cbs = cbs + 2.0 * cf; - cf0 = cf1; - cf1 = cf; + if (a0 <= joint_val) { + // Using power series definition for |z|<=joint_val (default:18) + CmplxType cr = CmplxType(1.0e+00, 0.0e+00); + CmplxType z2 = z * z; + for (int k = 1; k < n_terms; ++k) { + cr = RealType(.25) * cr * z2 / CmplxType(k * k); + cbi0 += cr; + if (Kokkos::abs(cr / cbi0) < RealType(1.e-15)) continue; } - cs0 = Kokkos::exp(z1) / (cbs - cf); - cbi0 = cbi0 * cs0; - } else { // Using asymptotic expansion (6.2.1) for |z|>joint_val - // (default:25) - CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1); - cbi0 = CmplxType(1.0, 0.0); - CmplxType zr = 1.0 / z1; + } else { + // Using asymptotic expansion (6.2.1) for |z|>joint_val (default:18) + const RealType a[12] = {0.125, + 7.03125e-2, + 7.32421875e-2, + 1.1215209960938e-1, + 2.2710800170898e-1, + 5.7250142097473e-1, + 1.7277275025845e0, + 6.0740420012735e0, + 2.4380529699556e1, + 1.1001714026925e2, + 5.5133589612202e2, + 3.0380905109224e3}; + for (int k = 1; k <= 12; k++) { - cbi0 = cbi0 + a[k - 1] * Kokkos::pow(zr, 1.0 * k); + cbi0 += a[k - 1] * Kokkos::pow(z1, -k); } - cbi0 = ca * cbi0; + cbi0 *= Kokkos::exp(z1) / + Kokkos::sqrt(2.0 * Kokkos::numbers::pi_v * z1); } } return cbi0; diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 247f991f810..06c84c75137 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1058,7 +1058,7 @@ struct TestComplexBesselI0K0Function { void testit() { using Kokkos::Experimental::infinity; - int N = 25; + int N = 26; d_z = ViewType("d_z", N); d_cbi0 = 
ViewType("d_cbi0", N); d_cbk0 = ViewType("d_cbk0", N); @@ -1094,6 +1094,7 @@ struct TestComplexBesselI0K0Function { h_z(22) = Kokkos::complex(-28.0, 0.0); h_z(23) = Kokkos::complex(60.0, 0.0); h_z(24) = Kokkos::complex(-60.0, 0.0); + h_z(25) = Kokkos::complex(7.998015e-5, 0.0); Kokkos::deep_copy(d_z, h_z); @@ -1152,6 +1153,7 @@ struct TestComplexBesselI0K0Function { h_ref_cbi0(22) = Kokkos::complex(1.095346047317573e+11, 0); h_ref_cbi0(23) = Kokkos::complex(5.894077055609803e+24, 0); h_ref_cbi0(24) = Kokkos::complex(5.894077055609803e+24, 0); + h_ref_cbi0(25) = Kokkos::complex(1.0000000015992061009, 0); h_ref_cbk0(0) = Kokkos::complex(infinity::value, 0); h_ref_cbk0(1) = @@ -1198,6 +1200,7 @@ struct TestComplexBesselI0K0Function { h_ref_cbk0(23) = Kokkos::complex(1.413897840559108e-27, 0); h_ref_cbk0(24) = Kokkos::complex(1.413897840559108e-27, -1.851678917759592e+25); + h_ref_cbk0(25) = Kokkos::complex(9.5496636116079915979, 0.); // FIXME_HIP Disable the test when using ROCm 5.5 and 5.6 due to a known // compiler bug From 6b21fde9efd2c057f507ac7883eb37073aff910c Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Wed, 4 Oct 2023 13:18:05 +0000 Subject: [PATCH 035/432] cleaning: remove iostream from headers where possible (IWYU) --- containers/src/Kokkos_UnorderedMap.hpp | 2 -- containers/src/impl/Kokkos_Bitset_impl.hpp | 1 - containers/src/impl/Kokkos_UnorderedMap_impl.hpp | 1 - core/perf_test/test_mempool.cpp | 1 + core/perf_test/test_taskdag.cpp | 2 ++ core/src/Kokkos_MemoryPool.hpp | 2 -- core/src/Kokkos_Parallel_Reduce.hpp | 1 - core/src/OpenMP/Kokkos_OpenMP.cpp | 2 ++ core/src/Threads/Kokkos_ThreadsExec.hpp | 3 ++- core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 2 -- core/src/impl/Kokkos_SharedAlloc_timpl.hpp | 6 +++--- example/build_cmake_in_tree/cmake_example.cpp | 2 ++ example/build_cmake_installed/cmake_example.cpp | 2 ++ .../build_cmake_installed_kk_as_language/cmake_example.cpp | 2 ++ 14 files changed, 16 insertions(+), 13 deletions(-) diff --git 
a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index c0ee55887c7..75abaf02e4d 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -35,8 +35,6 @@ #include #include -#include - #include namespace Kokkos { diff --git a/containers/src/impl/Kokkos_Bitset_impl.hpp b/containers/src/impl/Kokkos_Bitset_impl.hpp index c598e4b67b2..b71037afeaa 100644 --- a/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -23,7 +23,6 @@ #include #include -#include #include namespace Kokkos { diff --git a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index e0649f6029c..8f8cd9523b7 100644 --- a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -22,7 +22,6 @@ #include #include -#include #include namespace Kokkos { diff --git a/core/perf_test/test_mempool.cpp b/core/perf_test/test_mempool.cpp index abb6180346a..9905740afb4 100644 --- a/core/perf_test/test_mempool.cpp +++ b/core/perf_test/test_mempool.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/core/perf_test/test_taskdag.cpp b/core/perf_test/test_taskdag.cpp index bbb48af6c43..fccaab64ddf 100644 --- a/core/perf_test/test_taskdag.cpp +++ b/core/perf_test/test_taskdag.cpp @@ -14,6 +14,8 @@ // //@HEADER +#include + #include #if !defined(KOKKOS_ENABLE_TASKDAG) || \ diff --git a/core/src/Kokkos_MemoryPool.hpp b/core/src/Kokkos_MemoryPool.hpp index 6b47c437276..ce8c9e152fa 100644 --- a/core/src/Kokkos_MemoryPool.hpp +++ b/core/src/Kokkos_MemoryPool.hpp @@ -29,8 +29,6 @@ static_assert(false, #include #include -#include - namespace Kokkos { namespace Impl { /* Report violation of size constraints: diff --git a/core/src/Kokkos_Parallel_Reduce.hpp b/core/src/Kokkos_Parallel_Reduce.hpp index 34897ab1ee7..d499eba6dcc 100644 --- a/core/src/Kokkos_Parallel_Reduce.hpp +++ 
b/core/src/Kokkos_Parallel_Reduce.hpp @@ -27,7 +27,6 @@ static_assert(false, #include #include #include -#include namespace Kokkos { diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index aa185a0bc0d..9a169a435c7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -18,6 +18,8 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include + #include #include diff --git a/core/src/Threads/Kokkos_ThreadsExec.hpp b/core/src/Threads/Kokkos_ThreadsExec.hpp index 8b15928debc..35e815828a6 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -20,8 +20,9 @@ #include #include - +#include #include + #include #include diff --git a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index b3731671276..f740c408fb8 100644 --- a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -21,8 +21,6 @@ #include #include -#include - namespace Kokkos { namespace Impl { diff --git a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index 9aa96e27d15..d403ef9db06 100644 --- a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -24,9 +24,9 @@ #include // used with HostInaccessible specializations -#include // std::string -#include // strncpy -#include // ostream +#include +#include +#include namespace Kokkos { namespace Impl { diff --git a/example/build_cmake_in_tree/cmake_example.cpp b/example/build_cmake_in_tree/cmake_example.cpp index b345c48f535..2b9a263f87d 100644 --- a/example/build_cmake_in_tree/cmake_example.cpp +++ b/example/build_cmake_in_tree/cmake_example.cpp @@ -15,7 +15,9 @@ //@HEADER #include + #include +#include int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); diff --git a/example/build_cmake_installed/cmake_example.cpp b/example/build_cmake_installed/cmake_example.cpp index 
ca11250edd8..ba501659791 100644 --- a/example/build_cmake_installed/cmake_example.cpp +++ b/example/build_cmake_installed/cmake_example.cpp @@ -15,7 +15,9 @@ //@HEADER #include + #include +#include extern "C" void print_fortran_(); diff --git a/example/build_cmake_installed_kk_as_language/cmake_example.cpp b/example/build_cmake_installed_kk_as_language/cmake_example.cpp index c7f24bd5ab6..c71d75eeb8c 100644 --- a/example/build_cmake_installed_kk_as_language/cmake_example.cpp +++ b/example/build_cmake_installed_kk_as_language/cmake_example.cpp @@ -15,7 +15,9 @@ //@HEADER #include + #include +#include extern "C" void print_fortran_(); void print_cxx(); From 96edf73bd3a3431052844a6ff38f233f2c9e2a16 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 5 Oct 2023 11:15:26 -0400 Subject: [PATCH 036/432] Fix compiling SIMD unit tests on NVIDIA --- simd/src/Kokkos_SIMD_AVX2.hpp | 2 +- simd/src/Kokkos_SIMD_Common.hpp | 52 ++++++++++--------- simd/unit_tests/include/SIMDTesting_Ops.hpp | 6 +-- .../include/TestSIMD_GeneratorCtors.hpp | 31 +++++++---- 4 files changed, 52 insertions(+), 39 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index c7c66d83aff..cf9894ed2c4 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -758,7 +758,7 @@ class simd> { std::is_invocable_r_v>, bool> = false> - KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) : m_value(_mm_setr_ps(gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index c6fd16ca686..cb0879338e4 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -381,7 +381,7 @@ template #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 namespace Experimental { template -[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION +[[nodiscard]] KOKKOS_DEPRECATED 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd min(Experimental::simd const& a, Experimental::simd const& b) { @@ -403,7 +403,7 @@ template #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 namespace Experimental { template -[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd max(Experimental::simd const& a, Experimental::simd const& b) { @@ -431,8 +431,9 @@ template } \ namespace Experimental { \ template \ - [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION simd \ - FUNC(simd const& a) { \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ + simd \ + FUNC(simd const& a) { \ return Kokkos::FUNC(a); \ } \ } @@ -488,8 +489,9 @@ KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma) } \ namespace Experimental { \ template \ - [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION simd \ - FUNC(simd const& a, simd const& b) { \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ + simd \ + FUNC(simd const& a, simd const& b) { \ Kokkos::FUNC(a, b); \ } \ } @@ -513,24 +515,26 @@ KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2) KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign) #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a, \ - Experimental::simd const& b, \ - Experimental::simd const& c) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ - } \ - return result; \ - } \ - namespace Experimental { \ - template \ - [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION simd \ - FUNC(simd const& a, simd const& b, simd const& c) { \ - return Kokkos::FUNC(a, b, c); \ - } \ +#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd 
FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b, \ + Experimental::simd const& c) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ + simd \ + FUNC(simd const& a, simd const& b, \ + simd const& c) { \ + return Kokkos::FUNC(a, b, c); \ + } \ } #else #define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 15813a963c4..02af4f82eb8 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -80,7 +80,7 @@ class absolutes { template auto on_host(T const& a) const { if constexpr (std::is_signed_v) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) && !defined(KOKKOS_COMPILER_NVCC) return Kokkos::Experimental::abs(a); #else return Kokkos::abs(a); @@ -95,11 +95,7 @@ class absolutes { template KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { if constexpr (std::is_signed_v) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - return Kokkos::Experimental::abs(a); -#else return Kokkos::abs(a); -#endif } return a; } diff --git a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 230245a4e58..4feff3a89d2 100644 --- a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -36,20 +36,33 @@ inline void host_check_gen_ctor() { expected[i] = (init_mask[i]) ? 
init[i] * 9 : init[i]; } - simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); - mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; }); - simd_type rhs; rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); - host_check_equality(basic, rhs, lanes); - - simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); - simd_type result( - KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); simd_type blend; blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); - host_check_equality(blend, result, lanes); + + if constexpr (std::is_same_v) { + simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); + host_check_equality(basic, rhs, lanes); + + simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); + mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; }); + simd_type result( + KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + + host_check_equality(blend, result, lanes); + } else { + simd_type basic([=](std::size_t i) { return init[i]; }); + host_check_equality(basic, rhs, lanes); + + simd_type lhs([=](std::size_t i) { return init[i] * 9; }); + mask_type mask([=](std::size_t i) { return init_mask[i]; }); + simd_type result( + [=](std::size_t i) { return (mask[i]) ? 
lhs[i] : rhs[i]; }); + + host_check_equality(blend, result, lanes); + } } template From 4d3958becf9960800e2dcb44b94c7508a877dbfd Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 5 Oct 2023 11:10:41 -0600 Subject: [PATCH 037/432] guards to ensure DBL_EPSILON return for POWER8,9 --- core/unit_test/TestMathematicalFunctions.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index bb78432c659..2e0848f37bb 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -247,9 +247,15 @@ struct FloatingPointComparison { #endif KOKKOS_FUNCTION double eps(float) const { return FLT_EPSILON; } +// POWER9 gives unexpected values with LDBL_EPSILON issues +// https://stackoverflow.com/questions/68960416/ppc64-long-doubles-machine-epsilon-calculation +#if defined(KOKKOS_ARCH_POWER9) || defined(KOKKOS_ARCH_POWER8) + KOKKOS_FUNCTION + double eps(long double) const { return DBL_EPSILON; } +#else KOKKOS_FUNCTION double eps(long double) const { return LDBL_EPSILON; } - +#endif // Using absolute here instead of abs, since we actually test abs ... 
template KOKKOS_FUNCTION std::enable_if_t::value, T> absolute( From 890148e5ddffff1bd2c50ec8d7a0b1a32dbacc21 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 5 Oct 2023 16:08:58 -0400 Subject: [PATCH 038/432] Fix NVCC warnings (#6483) * Fix NVCC warnings * Use ternary instead of Kokkos::min --- .../src/sorting/Kokkos_BinSortPublicAPI.hpp | 7 +++--- .../unit_tests/TestStdAlgorithmsCommon.hpp | 3 ++- core/src/Kokkos_Array.hpp | 8 +++--- core/src/Kokkos_BitManipulation.hpp | 25 +++++++------------ core/unit_test/TestMathematicalFunctions.hpp | 3 ++- 5 files changed, 20 insertions(+), 26 deletions(-) diff --git a/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index 50daf30803e..c399279fe48 100644 --- a/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -387,10 +387,9 @@ class BinSort { // Switching to std::sort for more than 10 elements has been found // reasonable experimentally. 
if (use_std_sort && bin_size > 10) { - if constexpr (use_std_sort) { - std::sort(&sort_order(lower_bound), &sort_order(upper_bound), - [this](int p, int q) { return bin_op(keys_rnd, p, q); }); - } + KOKKOS_IF_ON_HOST( + (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { int old_idx = sort_order(k); diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 0e27917f3b5..b962218b5f0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -198,7 +198,8 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) { // this is needed for intel to avoid // error #1011: missing return statement at end of non-void function -#if defined KOKKOS_COMPILER_INTEL +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) __builtin_unreachable(); #endif } diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 599d4bec21a..82ceaaec218 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -234,14 +234,14 @@ struct Array::contiguous> { KOKKOS_INLINE_FUNCTION Array& operator=(const Array& rhs) { - const size_t n = std::min(m_size, rhs.size()); + const size_t n = size() < rhs.size() ? size() : rhs.size(); for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; return *this; } template KOKKOS_INLINE_FUNCTION Array& operator=(const Array& rhs) { - const size_t n = std::min(m_size, rhs.size()); + const size_t n = size() < rhs.size() ? size() : rhs.size(); for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; return *this; } @@ -303,14 +303,14 @@ struct Array::strided> { KOKKOS_INLINE_FUNCTION Array& operator=(const Array& rhs) { - const size_t n = std::min(m_size, rhs.size()); + const size_t n = size() < rhs.size() ? 
size() : rhs.size(); for (size_t i = 0; i < n; ++i) m_elem[i * m_stride] = rhs[i]; return *this; } template KOKKOS_INLINE_FUNCTION Array& operator=(const Array& rhs) { - const size_t n = std::min(m_size, rhs.size()); + const size_t n = size() < rhs.size() ? size() : rhs.size(); for (size_t i = 0; i < n; ++i) m_elem[i * m_stride] = rhs[i]; return *this; } diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index fbc7304365f..f3baf71660e 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -287,15 +287,13 @@ KOKKOS_IMPL_DEVICE_FUNCTION std::enable_if_t, int> countl_zero_builtin_device(T x) noexcept { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - if constexpr (sizeof(T) == sizeof(long long int)) { + if constexpr (sizeof(T) == sizeof(long long int)) return __clzll(reinterpret_cast(x)); - } else if constexpr (sizeof(T) == sizeof(int)) { + if constexpr (sizeof(T) == sizeof(int)) return __clz(reinterpret_cast(x)); - } else { - using ::Kokkos::Experimental::digits_v; - constexpr int shift = digits_v - digits_v; - return __clz(x) - shift; - } + using ::Kokkos::Experimental::digits_v; + constexpr int shift = digits_v - digits_v; + return __clz(x) - shift; #elif defined(KOKKOS_ENABLE_SYCL) return sycl::clz(x); #else @@ -332,11 +330,9 @@ KOKKOS_IMPL_DEVICE_FUNCTION using ::Kokkos::Experimental::digits_v; if (x == 0) return digits_v; #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - if constexpr (sizeof(T) == sizeof(long long int)) { + if constexpr (sizeof(T) == sizeof(long long int)) return __ffsll(reinterpret_cast(x)) - 1; - } else { - return __ffs(reinterpret_cast(x)) - 1; - } + return __ffs(reinterpret_cast(x)) - 1; #elif defined(KOKKOS_ENABLE_SYCL) return sycl::ctz(x); #else @@ -368,11 +364,8 @@ KOKKOS_IMPL_DEVICE_FUNCTION std::enable_if_t, int> popcount_builtin_device(T x) noexcept { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - if constexpr 
(sizeof(T) == sizeof(long long int)) { - return __popcll(x); - } else { - return __popc(x); - } + if constexpr (sizeof(T) == sizeof(long long int)) return __popcll(x); + return __popc(x); #elif defined(KOKKOS_ENABLE_SYCL) return sycl::popcount(x); #else diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index bb78432c659..e4a75a9a1db 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -30,7 +30,8 @@ #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS #endif -#if defined KOKKOS_COMPILER_INTEL +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable(); #else #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE From e40f026efa959a9e77a08424cfc783104582f2c0 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 5 Oct 2023 22:12:07 +0200 Subject: [PATCH 039/432] team-level std algos: part 13 (#6351) Add team level algorithms for inclusive scan --- .../std_algorithms/Kokkos_InclusiveScan.hpp | 311 +++++++++++++----- .../Kokkos_TransformInclusiveScan.hpp | 244 ++++++++++---- .../impl/Kokkos_InclusiveScan.hpp | 144 +++++++- .../impl/Kokkos_TransformInclusiveScan.hpp | 212 ++++++++++-- algorithms/unit_tests/CMakeLists.txt | 14 +- .../TestStdAlgorithmsTeamInclusiveScan.cpp | 277 ++++++++++++++++ ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 264 +++++++++++++++ ...estStdAlgorithmsTransformInclusiveScan.cpp | 14 +- 8 files changed, 1299 insertions(+), 181 deletions(-) create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp diff --git a/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp index bcd731b850a..a0e540b5e7a 100644 --- 
a/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp @@ -23,33 +23,45 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest) { - return Impl::inclusive_scan_default_op_impl( +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest) { + return Impl::inclusive_scan_default_op_exespace_impl( "Kokkos::inclusive_scan_default_functors_iterator_api", ex, first, last, first_dest); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest) { - return Impl::inclusive_scan_default_op_impl(label, ex, first, last, - first_dest); +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest) { + return Impl::inclusive_scan_default_op_exespace_impl(label, ex, first, last, + first_dest); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -57,13 +69,15 @@ auto inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_default_op_impl( + return Impl::inclusive_scan_default_op_exespace_impl( "Kokkos::inclusive_scan_default_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -71,39 +85,45 @@ auto inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), - KE::begin(view_dest)); + return Impl::inclusive_scan_default_op_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest)); } // overload set 2 (accepting custom binary op) -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOp binary_op) { - return Impl::inclusive_scan_custom_binary_op_impl( +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType 
first_dest, + BinaryOp binary_op) { + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last, first_dest, binary_op); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOp binary_op) { - return Impl::inclusive_scan_custom_binary_op_impl(label, ex, first, last, - first_dest, binary_op); +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, BinaryOp binary_op) { + return Impl::inclusive_scan_custom_binary_op_exespace_impl( + label, ex, first, last, first_dest, binary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -111,14 +131,16 @@ auto inclusive_scan(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename BinaryOp, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -126,67 +148,192 @@ auto inclusive_scan(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op); } // overload set 3 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOp binary_op, ValueType init_value) { - return Impl::inclusive_scan_custom_binary_op_impl( +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + BinaryOp binary_op, ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last, - first_dest, binary_op, init_value); + first_dest, binary_op, std::move(init_value)); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOp binary_op, - ValueType 
init_value) { - return Impl::inclusive_scan_custom_binary_op_impl( - label, ex, first, last, first_dest, binary_op, init_value); +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + BinaryOp binary_op, ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::inclusive_scan_custom_binary_op_exespace_impl( + label, ex, first, last, first_dest, binary_op, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOp binary_op, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - binary_op, init_value); + binary_op, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename BinaryOp, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOp binary_op, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), binary_op, init_value); + KE::begin(view_dest), binary_op, std::move(init_value)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. 
+// + +// overload set 1 +template && :: + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +inclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest) { + return Impl::inclusive_scan_default_op_team_impl(teamHandle, first, last, + first_dest); +} + +template , int> = 0> +KOKKOS_FUNCTION auto inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::inclusive_scan_default_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest)); +} + +// overload set 2 (accepting custom binary op) +template && :: + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType inclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, BinaryOp binary_op) { + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, first, last, first_dest, binary_op); +} + +template , int> = 0> +KOKKOS_FUNCTION auto inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOp binary_op) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op); +} + +// overload set 3 +template && :: + Kokkos::is_team_handle_v, + int> = 0> + +KOKKOS_FUNCTION OutputIteratorType +inclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, 
OutputIteratorType first_dest, + BinaryOp binary_op, ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, first, last, first_dest, binary_op, std::move(init_value)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOp binary_op, ValueType init_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + namespace KE = ::Kokkos::Experimental; + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op, std::move(init_value)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp index 7489af7e379..5f694dbfd98 100644 --- a/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp @@ -23,40 +23,53 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1 (no init value) -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOpType binary_op, UnaryOpType unary_op) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + 
OutputIteratorType first_dest, + BinaryOpType binary_op, + UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex, first, last, first_dest, binary_op, unary_op); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOpType binary_op, - UnaryOpType unary_op) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest, - binary_op, unary_op); + return Impl::transform_inclusive_scan_exespace_impl( + label, ex, first, last, first_dest, binary_op, unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename BinaryOpType, + typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -66,15 +79,17 @@ auto transform_inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op, unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOpType, + typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -84,46 +99,59 @@ auto transform_inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op, unary_op); } // overload set 2 (init value) -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOpType binary_op, UnaryOpType unary_op, - ValueType init_value) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan( + const 
ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, + OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op, + ValueType init_value) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl( + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex, - first, last, first_dest, binary_op, unary_op, init_value); + first, last, first_dest, binary_op, unary_op, std::move(init_value)); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOpType binary_op, - UnaryOpType unary_op, ValueType init_value) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest, - binary_op, unary_op, init_value); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::transform_inclusive_scan_exespace_impl( + label, ex, first, last, first_dest, binary_op, unary_op, + std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename BinaryOpType, + typename UnaryOpType, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -132,16 +160,21 @@ auto transform_inclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - binary_op, unary_op, init_value); + binary_op, unary_op, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... 
Properties2, typename BinaryOpType, + typename UnaryOpType, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -150,10 +183,97 @@ auto transform_inclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), binary_op, unary_op, init_value); + KE::begin(view_dest), binary_op, unary_op, std::move(init_value)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. 
+// + +// overload set 1 (no init value) +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + + return Impl::transform_inclusive_scan_team_impl( + teamHandle, first, last, first_dest, binary_op, unary_op); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform_inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::transform_inclusive_scan_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op, unary_op); +} + +// overload set 2 (init value) +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { + Impl::static_assert_is_not_openmptarget(teamHandle); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::transform_inclusive_scan_team_impl( + teamHandle, first, last, first_dest, binary_op, unary_op, + std::move(init_value)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform_inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { 
+ Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + namespace KE = ::Kokkos::Experimental; + return Impl::transform_inclusive_scan_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op, unary_op, std::move(init_value)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp index ecd6ff39cd5..0b4acec0feb 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp @@ -101,9 +101,12 @@ struct InclusiveScanDefaultFunctor { } }; +// +// exespace impl +// template -OutputIteratorType inclusive_scan_default_op_impl( +OutputIteratorType inclusive_scan_default_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest) { @@ -143,7 +146,7 @@ OutputIteratorType inclusive_scan_default_op_impl( // ------------------------------------------------------------- template -OutputIteratorType inclusive_scan_custom_binary_op_impl( +OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, BinaryOpType binary_op) { @@ -158,7 +161,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( using value_type = std::remove_const_t; using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; - using func_type = TransformInclusiveScanNoInitValueFunctor< + using func_type = ExeSpaceTransformInclusiveScanNoInitValueFunctor< ExecutionSpace, index_type, value_type, 
InputIteratorType, OutputIteratorType, BinaryOpType, unary_op_type>; @@ -179,7 +182,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( // ------------------------------------------------------------- template -OutputIteratorType inclusive_scan_custom_binary_op_impl( +OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, BinaryOpType binary_op, @@ -193,7 +196,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( // aliases using index_type = typename InputIteratorType::difference_type; using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; - using func_type = TransformInclusiveScanWithInitValueFunctor< + using func_type = ExeSpaceTransformInclusiveScanWithInitValueFunctor< ExecutionSpace, index_type, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType, unary_op_type>; @@ -203,13 +206,142 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( ::Kokkos::parallel_scan(label, RangePolicy(ex, 0, num_elements), func_type(first_from, first_dest, binary_op, - unary_op_type(), init_value)); + unary_op_type(), std::move(init_value))); ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation"); // return return first_dest + num_elements; } +// +// team impl +// +template +KOKKOS_FUNCTION OutputIteratorType inclusive_scan_default_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + using value_type = + std::remove_const_t; + + // #if defined(KOKKOS_ENABLE_CUDA) + + using exe_space = typename TeamHandleType::execution_space; + using 
index_type = typename InputIteratorType::difference_type; + using func_type = std::conditional_t< + ::Kokkos::is_detected::value, + InclusiveScanDefaultFunctorForKnownIdentityElement< + exe_space, index_type, value_type, InputIteratorType, + OutputIteratorType>, + InclusiveScanDefaultFunctor>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest)); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + +// ------------------------------------------------------------- +// inclusive_scan_custom_binary_op_impl +// ------------------------------------------------------------- +template +KOKKOS_FUNCTION OutputIteratorType inclusive_scan_custom_binary_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOpType binary_op) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + using value_type = + std::remove_const_t; + + static_assert( + ::Kokkos::is_detected_v, + "At the moment inclusive_scan doesn't support types without reduction " + "identity"); + + // #if defined(KOKKOS_ENABLE_CUDA) + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using func_type = TeamTransformInclusiveScanNoInitValueFunctor< + exe_space, value_type, InputIteratorType, OutputIteratorType, + BinaryOpType, unary_op_type>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + + ::Kokkos::parallel_scan( + TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest, binary_op, 
unary_op_type())); + teamHandle.team_barrier(); + + return first_dest + num_elements; +} + +// ------------------------------------------------------------- +// inclusive_scan_custom_binary_op_impl with init_value +// ------------------------------------------------------------- +template +KOKKOS_FUNCTION OutputIteratorType inclusive_scan_custom_binary_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOpType binary_op, ValueType init_value) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "At the moment inclusive_scan doesn't support types without reduction " + "identity"); + + // #if defined(KOKKOS_ENABLE_CUDA) + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using func_type = TeamTransformInclusiveScanWithInitValueFunctor< + exe_space, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType, + unary_op_type>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest, binary_op, + unary_op_type(), std::move(init_value))); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp index 05f8589086f..dc432573ee3 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp +++ 
b/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp @@ -31,7 +31,7 @@ namespace Impl { template -struct TransformInclusiveScanNoInitValueFunctor { +struct ExeSpaceTransformInclusiveScanNoInitValueFunctor { using execution_space = ExeSpace; using value_type = ValueWrapperForNoNeutralElement; @@ -41,9 +41,10 @@ struct TransformInclusiveScanNoInitValueFunctor { UnaryOpType m_unary_op; KOKKOS_FUNCTION - TransformInclusiveScanNoInitValueFunctor(FirstFrom first_from, - FirstDest first_dest, - BinaryOpType bop, UnaryOpType uop) + ExeSpaceTransformInclusiveScanNoInitValueFunctor(FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop) : m_first_from(std::move(first_from)), m_first_dest(std::move(first_dest)), m_binary_op(std::move(bop)), @@ -80,7 +81,7 @@ struct TransformInclusiveScanNoInitValueFunctor { template -struct TransformInclusiveScanWithInitValueFunctor { +struct ExeSpaceTransformInclusiveScanWithInitValueFunctor { using execution_space = ExeSpace; using value_type = ValueWrapperForNoNeutralElement; @@ -91,10 +92,11 @@ struct TransformInclusiveScanWithInitValueFunctor { ValueType m_init; KOKKOS_FUNCTION - TransformInclusiveScanWithInitValueFunctor(FirstFrom first_from, - FirstDest first_dest, - BinaryOpType bop, UnaryOpType uop, - ValueType init) + ExeSpaceTransformInclusiveScanWithInitValueFunctor(FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop, + ValueType init) : m_first_from(std::move(first_from)), m_first_dest(std::move(first_dest)), m_binary_op(std::move(bop)), @@ -131,18 +133,20 @@ struct TransformInclusiveScanWithInitValueFunctor { } }; +// +// exespace impl +// + // ------------------------------------------------------------- -// transform_inclusive_scan_impl without init_value +// transform_inclusive_scan_exespace_impl without init_value // ------------------------------------------------------------- template -OutputIteratorType transform_inclusive_scan_impl(const 
std::string& label, - const ExecutionSpace& ex, - InputIteratorType first_from, - InputIteratorType last_from, - OutputIteratorType first_dest, - BinaryOpType binary_op, - UnaryOpType unary_op) { +OutputIteratorType transform_inclusive_scan_exespace_impl( + const std::string& label, const ExecutionSpace& ex, + InputIteratorType first_from, InputIteratorType last_from, + OutputIteratorType first_dest, BinaryOpType binary_op, + UnaryOpType unary_op) { // checks Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); Impl::static_assert_iterators_have_matching_difference_type(first_from, @@ -153,7 +157,7 @@ OutputIteratorType transform_inclusive_scan_impl(const std::string& label, using index_type = typename InputIteratorType::difference_type; using value_type = std::remove_const_t; - using func_type = TransformInclusiveScanNoInitValueFunctor< + using func_type = ExeSpaceTransformInclusiveScanNoInitValueFunctor< ExecutionSpace, index_type, value_type, InputIteratorType, OutputIteratorType, BinaryOpType, UnaryOpType>; @@ -170,12 +174,12 @@ OutputIteratorType transform_inclusive_scan_impl(const std::string& label, } // ------------------------------------------------------------- -// transform_inclusive_scan_impl with init_value +// transform_inclusive_scan_exespace_impl with init_value // ------------------------------------------------------------- template -OutputIteratorType transform_inclusive_scan_impl( +OutputIteratorType transform_inclusive_scan_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op, @@ -188,22 +192,182 @@ OutputIteratorType transform_inclusive_scan_impl( // aliases using index_type = typename InputIteratorType::difference_type; - using func_type = TransformInclusiveScanWithInitValueFunctor< + using func_type = ExeSpaceTransformInclusiveScanWithInitValueFunctor< ExecutionSpace, 
index_type, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType, UnaryOpType>; // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); - ::Kokkos::parallel_scan( - label, RangePolicy(ex, 0, num_elements), - func_type(first_from, first_dest, binary_op, unary_op, init_value)); + ::Kokkos::parallel_scan(label, + RangePolicy(ex, 0, num_elements), + func_type(first_from, first_dest, binary_op, unary_op, + std::move(init_value))); ex.fence("Kokkos::transform_inclusive_scan: fence after operation"); // return return first_dest + num_elements; } +// +// team impl +// + +template +struct TeamTransformInclusiveScanNoInitValueFunctor { + using execution_space = ExeSpace; + using index_type = typename FirstFrom::difference_type; + + FirstFrom m_first_from; + FirstDest m_first_dest; + BinaryOpType m_binary_op; + UnaryOpType m_unary_op; + + KOKKOS_FUNCTION + TeamTransformInclusiveScanNoInitValueFunctor(FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop) + : m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)), + m_binary_op(std::move(bop)), + m_unary_op(std::move(uop)) {} + + KOKKOS_FUNCTION + void operator()(const index_type i, ValueType& update, + const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; + this->join(update, tmp); + if (final_pass) { + m_first_dest[i] = update; + } + } + + KOKKOS_FUNCTION + void init(ValueType& update) const { update = {}; } + + KOKKOS_FUNCTION + void join(ValueType& update, const ValueType& input) const { + update = m_binary_op(update, input); + } +}; + +template +struct TeamTransformInclusiveScanWithInitValueFunctor { + using execution_space = ExeSpace; + using index_type = typename FirstFrom::difference_type; + + FirstFrom m_first_from; + FirstDest m_first_dest; + BinaryOpType m_binary_op; + UnaryOpType m_unary_op; + ValueType m_init; + + KOKKOS_FUNCTION + TeamTransformInclusiveScanWithInitValueFunctor(FirstFrom 
first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop, + ValueType init) + : m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)), + m_binary_op(std::move(bop)), + m_unary_op(std::move(uop)), + m_init(std::move(init)) {} + + KOKKOS_FUNCTION + void operator()(const index_type i, ValueType& update, + const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; + this->join(update, tmp); + + if (final_pass) { + m_first_dest[i] = m_binary_op(update, m_init); + } + } + + KOKKOS_FUNCTION + void init(ValueType& update) const { update = {}; } + + KOKKOS_FUNCTION + void join(ValueType& update, const ValueType& input) const { + update = m_binary_op(update, input); + } +}; + +// ------------------------------------------------------------- +// transform_inclusive_scan_team_impl without init_value +// ------------------------------------------------------------- +template +KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using value_type = + std::remove_const_t; + using func_type = TeamTransformInclusiveScanNoInitValueFunctor< + exe_space, value_type, InputIteratorType, OutputIteratorType, + BinaryOpType, UnaryOpType>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan( + TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest, binary_op, unary_op)); + teamHandle.team_barrier(); + + // return + return 
first_dest + num_elements; +} + +// ------------------------------------------------------------- +// transform_inclusive_scan_team_impl with init_value +// ------------------------------------------------------------- +template +KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using func_type = TeamTransformInclusiveScanWithInitValueFunctor< + exe_space, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType, + UnaryOpType>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest, binary_op, unary_op, + std::move(init_value))); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index cf5a0e7f68d..d866b4e250c 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -153,6 +153,18 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() + # ------------------------------------------ + # std team Q + # ------------------------------------------ + set(STDALGO_TEAM_SOURCES_Q) + foreach(Name + StdAlgorithmsCommon + StdAlgorithmsTeamInclusiveScan + StdAlgorithmsTeamTransformInclusiveScan + 
) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) + endforeach() + # ------------------------------------------ # std team P # ------------------------------------------ @@ -431,7 +443,7 @@ foreach(ID A;B;C;D;E) ) endforeach() -foreach(ID A;B;C;D;E;F;G;H;I;L;M;P) +foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) KOKKOS_ADD_EXECUTABLE_AND_TEST( AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp new file mode 100644 index 00000000000..0daf9dbfe82 --- /dev/null +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -0,0 +1,277 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "std_algorithms/Kokkos_BeginEnd.hpp" + +namespace Test { +namespace stdalgos { +namespace TeamInclusiveScan { + +namespace KE = Kokkos::Experimental; + +template +struct PlusFunctor { + KOKKOS_INLINE_FUNCTION constexpr ValueType operator()( + const ValueType& lhs, const ValueType& rhs) const { + return lhs + rhs; + } +}; + +template +struct TestFunctorA { + SourceViewType m_sourceView; + DestViewType m_destView; + DistancesViewType m_distancesView; + IntraTeamSentinelView m_intraTeamSentinelView; + InitValuesViewType m_initValuesView; + BinaryOpType m_binaryOp; + int m_apiPick; + + TestFunctorA(const SourceViewType sourceView, const DestViewType destView, + const DistancesViewType distancesView, + const IntraTeamSentinelView intraTeamSentinelView, + const InitValuesViewType initValuesView, BinaryOpType binaryOp, + int apiPick) + : m_sourceView(sourceView), + m_destView(destView), + m_distancesView(distancesView), + m_intraTeamSentinelView(intraTeamSentinelView), + m_initValuesView(initValuesView), + m_binaryOp(binaryOp), + m_apiPick(apiPick) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const { + const auto rowIndex = member.league_rank(); + + auto srcRow = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL()); + const auto first = KE::cbegin(srcRow); + const auto last = KE::cend(srcRow); + auto destRow = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL()); + const auto firstDest = KE::begin(destRow); + + const auto initVal = m_initValuesView(rowIndex); + ptrdiff_t resultDist = 0; + + switch (m_apiPick) { + case 0: { + auto it = KE::inclusive_scan(member, first, last, firstDest); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 1: { + auto it = KE::inclusive_scan(member, srcRow, destRow); + resultDist = 
KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 2: { + auto it = + KE::inclusive_scan(member, first, last, firstDest, m_binaryOp); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 3: { + auto it = KE::inclusive_scan(member, srcRow, destRow, m_binaryOp); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 4: { + auto it = KE::inclusive_scan(member, first, last, firstDest, m_binaryOp, + initVal); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 5: { + auto it = + KE::inclusive_scan(member, srcRow, destRow, m_binaryOp, initVal); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + } + + // store result of checking if all members have their local + // values matching the one stored in m_distancesView + member.team_barrier(); + const bool intraTeamCheck = team_members_have_matching_result( + member, resultDist, m_distancesView(rowIndex)); + Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { + m_intraTeamSentinelView(rowIndex) = intraTeamCheck; + }); + } +}; + +template +void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { + /* description: + use a rank-2 view randomly filled with values, + and run a team-level inclusive_scan + */ + + // ----------------------------------------------- + // prepare data + // ----------------------------------------------- + // create a view in the memory space associated with default exespace + // with as many rows as the number of teams and fill it with 
random + // values from an arbitrary range. + constexpr ValueType lowerBound = 5; + constexpr ValueType upperBound = 523; + const auto bounds = make_bounds(lowerBound, upperBound); + + auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone( + LayoutTag{}, numTeams, numCols, bounds, "sourceView"); + + // ----------------------------------------------- + // launch kokkos kernel + // ----------------------------------------------- + using space_t = Kokkos::DefaultExecutionSpace; + Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + + // inclusive_scan returns an iterator so to verify that it is correct + // each team stores the distance of the returned iterator from the beginning + // of the interval that team operates on and then we check that these + // distances match the std result + Kokkos::View distancesView("distancesView", numTeams); + // sentinel to check if all members of the team compute the same result + Kokkos::View intraTeamSentinelView("intraTeamSameResult", numTeams); + + PlusFunctor binaryOp; + + // Create view of reduce init values to be used by test cases + Kokkos::View initValuesView_h( + "initValuesView_h", numTeams); + using rand_pool = + Kokkos::Random_XorShift64_Pool; + rand_pool pool(lowerBound * upperBound); + Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); + + // use CTAD for functor + auto initValuesView = + Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + + // ----------------------------------------------- + // run cpp-std kernel and check + // ----------------------------------------------- + auto distancesView_h = create_host_space_copy(distancesView); + auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); + 
Kokkos::View stdDestView("stdDestView", + numTeams, numCols); + + for (std::size_t i = 0; i < sourceView.extent(0); ++i) { + auto srcRow = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); + auto first = KE::begin(srcRow); + auto last = KE::end(srcRow); + auto destRow = Kokkos::subview(stdDestView, i, Kokkos::ALL()); + auto firstDest = KE::begin(destRow); + auto initValue = initValuesView_h(i); + + ASSERT_TRUE(intraTeamSentinelView_h(i)); + +// libstdc++ as provided by GCC 8 does not have inclusive_scan and +// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy +#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9) +#define inclusive_scan testing_inclusive_scan +#else +#define inclusive_scan std::inclusive_scan +#endif + + switch (apiId) { + case 0: + case 1: { + auto it = inclusive_scan(first, last, firstDest); + const std::size_t stdDistance = KE::distance(firstDest, it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + + break; + } + + case 2: + case 3: { + auto it = inclusive_scan(first, last, firstDest, binaryOp); + const std::size_t stdDistance = KE::distance(firstDest, it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + + break; + } + + case 4: + case 5: { + auto it = inclusive_scan(first, last, firstDest, binaryOp, initValue); + const std::size_t stdDistance = KE::distance(firstDest, it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + + break; + } + } + +#undef inclusive_scan + } + + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); +} + +template +void run_all_scenarios() { + for (int numTeams : teamSizesToTest) { + for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { + for (int apiId : {0, 1, 2, 3, 4, 5}) { + test_A(numTeams, numCols, apiId); + } + } + } +} + +TEST(std_algorithms_inclusive_scan_team_test, test) { + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); +} + +} // namespace TeamInclusiveScan +} // namespace stdalgos 
+} // namespace Test diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp new file mode 100644 index 00000000000..4b316602326 --- /dev/null +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -0,0 +1,264 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#if not defined KOKKOS_ENABLE_OPENMPTARGET + +namespace Test { +namespace stdalgos { +namespace TeamTransformInclusiveScan { + +namespace KE = Kokkos::Experimental; + +template +struct PlusFunctor { + KOKKOS_INLINE_FUNCTION + ValueType operator()(const ValueType& lhs, const ValueType& rhs) const { + return lhs + rhs; + } +}; + +template +struct MultipliesByTwoFunctor { + KOKKOS_INLINE_FUNCTION + ValueType operator()(const ValueType& value) const { return value * 2; } +}; + +template +struct TestFunctorA { + SourceViewType m_sourceView; + DestViewType m_destView; + DistancesViewType m_distancesView; + IntraTeamSentinelView m_intraTeamSentinelView; + InitValuesViewType m_initValuesView; + BinaryOpType m_binaryOp; + UnaryOpType m_unaryOp; + int m_apiPick; + + TestFunctorA(const SourceViewType sourceView, const DestViewType destView, + const DistancesViewType distancesView, + const IntraTeamSentinelView intraTeamSentinelView, + const InitValuesViewType initValuesView, BinaryOpType binaryOp, + UnaryOpType unaryOp, int apiPick) + : m_sourceView(sourceView), + m_destView(destView), + 
m_distancesView(distancesView), + m_intraTeamSentinelView(intraTeamSentinelView), + m_initValuesView(initValuesView), + m_binaryOp(binaryOp), + m_unaryOp(unaryOp), + m_apiPick(apiPick) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType& member) const { + const auto rowIndex = member.league_rank(); + + auto srcRow = Kokkos::subview(m_sourceView, rowIndex, Kokkos::ALL()); + const auto first = KE::cbegin(srcRow); + const auto last = KE::cend(srcRow); + auto destRow = Kokkos::subview(m_destView, rowIndex, Kokkos::ALL()); + auto firstDest = KE::begin(destRow); + + const auto initVal = m_initValuesView(rowIndex); + ptrdiff_t resultDist = 0; + + switch (m_apiPick) { + case 0: { + auto it = KE::transform_inclusive_scan(member, first, last, firstDest, + m_binaryOp, m_unaryOp); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 1: { + auto it = KE::transform_inclusive_scan(member, srcRow, destRow, + m_binaryOp, m_unaryOp); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 2: { + auto it = KE::transform_inclusive_scan(member, first, last, firstDest, + m_binaryOp, m_unaryOp, initVal); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + + case 3: { + auto it = KE::transform_inclusive_scan(member, srcRow, destRow, + m_binaryOp, m_unaryOp, initVal); + resultDist = KE::distance(firstDest, it); + Kokkos::single(Kokkos::PerTeam(member), + [=, *this] { m_distancesView(rowIndex) = resultDist; }); + + break; + } + } + + // store result of checking if all members have their local + // values matching the one stored in m_distancesView + member.team_barrier(); + const bool intraTeamCheck = team_members_have_matching_result( + 
member, resultDist, m_distancesView(rowIndex)); + Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { + m_intraTeamSentinelView(rowIndex) = intraTeamCheck; + }); + } +}; + +template +void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { + /* description: + use a rank-2 view randomly filled with values, + and run a team-level transform_inclusive_scan + */ + + // ----------------------------------------------- + // prepare data + // ----------------------------------------------- + // create a view in the memory space associated with default exespace + // with as many rows as the number of teams and fill it with random + // values from an arbitrary range. + constexpr ValueType lowerBound = 5; + constexpr ValueType upperBound = 523; + const auto bounds = make_bounds(lowerBound, upperBound); + + auto [sourceView, sourceViewBeforeOp_h] = create_random_view_and_host_clone( + LayoutTag{}, numTeams, numCols, bounds, "sourceView"); + + // ----------------------------------------------- + // launch kokkos kernel + // ----------------------------------------------- + using space_t = Kokkos::DefaultExecutionSpace; + Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + + // tranform_inclusive_scan returns an iterator so to verify that it is correct + // each team stores the distance of the returned iterator from the beginning + // of the interval that team operates on and then we check that these + // distances match the std result + Kokkos::View distancesView("distancesView", numTeams); + // sentinel to check if all members of the team compute the same result + Kokkos::View intraTeamSentinelView("intraTeamSameResult", numTeams); + + PlusFunctor binaryOp; + MultipliesByTwoFunctor unaryOp; + + // Create view of reduce init values to be used by test cases + Kokkos::View initValuesView_h( + "initValuesView_h", numTeams); + using rand_pool = + 
Kokkos::Random_XorShift64_Pool; + rand_pool pool(lowerBound * upperBound); + Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); + + // use CTAD for functor + auto initValuesView = + Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + + // ----------------------------------------------- + // run cpp-std kernel and check + // ----------------------------------------------- + auto distancesView_h = create_host_space_copy(distancesView); + auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); + Kokkos::View stdDestView("stdDestView", + numTeams, numCols); + + for (std::size_t i = 0; i < sourceView.extent(0); ++i) { + auto srcRow = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); + auto first = KE::cbegin(srcRow); + auto last = KE::cend(srcRow); + auto destRow = Kokkos::subview(stdDestView, i, Kokkos::ALL()); + auto firstDest = KE::begin(destRow); + auto initValue = initValuesView_h(i); + + ASSERT_TRUE(intraTeamSentinelView_h(i)); + +// libstdc++ as provided by GCC 8 does not have transform_inclusive_scan and +// for GCC 9.1, 9.2 fails to compile for missing overload not accepting policy +#if defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE <= 9) +#define transform_inclusive_scan testing_transform_inclusive_scan +#else +#define transform_inclusive_scan std::transform_inclusive_scan +#endif + + switch (apiId) { + case 0: + case 1: { + const auto it = + transform_inclusive_scan(first, last, firstDest, binaryOp, unaryOp); + const std::size_t stdDistance = KE::distance(firstDest, it); + ASSERT_EQ(stdDistance, distancesView_h(i)); + + break; + } + + case 2: + case 3: { + const auto it = transform_inclusive_scan(first, last, firstDest, + binaryOp, unaryOp, initValue); + const std::size_t stdDistance = KE::distance(firstDest, it); + ASSERT_EQ(stdDistance, 
distancesView_h(i)); + + break; + } + } + } +#undef transform_inclusive_scan + + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); +} + +template +void run_all_scenarios() { + for (int numTeams : teamSizesToTest) { + for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { + for (int apiId : {0, 1, 2, 3}) { + test_A(numTeams, numCols, apiId); + } + } + } +} + +TEST(std_algorithms_transform_inclusive_scan_team_test, test) { + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); +} + +} // namespace TeamTransformInclusiveScan +} // namespace stdalgos +} // namespace Test + +#endif diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index 5d122ac5e89..a90a68ca1d7 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -355,18 +355,20 @@ TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan_functor) { using unary_op_type = KE::Impl::StdNumericScanIdentityReferenceUnaryFunctor; { - using functor_type = KE::Impl::TransformInclusiveScanNoInitValueFunctor< - exespace, int, int, view_type, view_type, MultiplyFunctor, - unary_op_type>; + using functor_type = + KE::Impl::ExeSpaceTransformInclusiveScanNoInitValueFunctor< + exespace, int, int, view_type, view_type, MultiplyFunctor, + unary_op_type>; functor_type functor(dummy_view, dummy_view, {}, {}); test_lambda(functor); } { - using functor_type = KE::Impl::TransformInclusiveScanWithInitValueFunctor< - exespace, int, int, view_type, view_type, MultiplyFunctor, - unary_op_type>; + using functor_type = + KE::Impl::ExeSpaceTransformInclusiveScanWithInitValueFunctor< + exespace, int, int, view_type, view_type, MultiplyFunctor, + unary_op_type>; functor_type functor(dummy_view, dummy_view, {}, {}, dummy); test_lambda(functor); From 
1ccf4995b121be643aa0f021c946f334a271eb5f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 5 Oct 2023 16:25:38 -0400 Subject: [PATCH 040/432] Also fix annotations for generator constructor for AVX512 and NEON --- simd/src/Kokkos_SIMD_AVX512.hpp | 2 +- simd/src/Kokkos_SIMD_NEON.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 75b62b94d63..66b922b2271 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -1064,7 +1064,7 @@ class simd> { std::is_invocable_r_v>, bool> = false> - KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) : m_value(_mm256_setr_ps(gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index a872fa71c9c..c3aca998b11 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -550,7 +550,7 @@ class simd> { std::is_invocable_r_v>, bool> = false> - KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen) { + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) { m_value = vset_lane_f32(gen(std::integral_constant()), m_value, 0); m_value = vset_lane_f32(gen(std::integral_constant()), From c5bf8705d1152f3b30fd044f9d4443843984a001 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 25 Jul 2023 15:34:51 +0200 Subject: [PATCH 041/432] #5635: SYCL: Add parallel_scan overload with value for ThreadVectorRange --- core/src/SYCL/Kokkos_SYCL_Team.hpp | 29 ++++++++++++++++++++++++++++- core/unit_test/TestTeamVector.hpp | 8 ++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index a6be78d5e96..89c09c3195f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -825,7 +825,7 @@ parallel_scan(const 
Impl::ThreadVectorRangeBoundariesStruct< // This sets i's val to i-1's contribution to make the latter shfl_up an // exclusive scan -- the final accumulation of i's val will be included in // the second closure call later. - if (i < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false); + if (i - 1 < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false); // Bottom up exclusive scan in triangular pattern where each SYCL thread is // the root of a reduction tree from the zeroth "lane" to itself. @@ -847,6 +847,7 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< if (i < loop_boundaries.end) closure(i, val, true); accum = sg.shuffle(val, mask + vector_offset); } + reducer.reference() = accum; } /** \brief Intra-thread vector parallel exclusive prefix sum. @@ -869,6 +870,32 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( parallel_scan(loop_boundaries, closure, Kokkos::Sum{dummy}); } +/** \brief Intra-thread vector parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. 
+ */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure, ValueType& return_val) { + // Extract ValueType from the Closure + using closure_value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, + void>::value_type; + static_assert(std::is_same::value, + "Non-matching value types of closure and return type"); + + ValueType accum; + parallel_scan(loop_boundaries, closure, Kokkos::Sum{accum}); + + return_val = accum; +} + } // namespace Kokkos namespace Kokkos { diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 5a0475a2058..c6fa182938e 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -607,8 +607,8 @@ struct functor_vec_scan { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ + !defined(KOKKOS_ENABLE_HPX) template struct functor_vec_scan_ret_val { using policy_type = Kokkos::TeamPolicy; @@ -735,8 +735,8 @@ bool test_scalar(int nteams, int team_size, int test) { } else if (test == 12) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. 
-#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ + !defined(KOKKOS_ENABLE_HPX) Kokkos::parallel_for( Kokkos::TeamPolicy(nteams, team_size, 8), functor_vec_scan_ret_val(d_flag, team_size)); From 8181d707581e32766c9f8b6f35da630ab3e7cb86 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 6 Oct 2023 08:43:56 -0600 Subject: [PATCH 042/432] Fix atomic operations bug for Min and Max (#6435) Fix atomic operations bug for Min and Max and add tests Co-authored-by: Daniel Arndt --- core/unit_test/TestAtomicOperations.hpp | 1421 +++++------------ .../TestAtomicOperations_complexdouble.hpp | 26 +- .../TestAtomicOperations_complexfloat.hpp | 26 +- .../unit_test/TestAtomicOperations_double.hpp | 19 +- core/unit_test/TestAtomicOperations_float.hpp | 19 +- core/unit_test/TestAtomicOperations_int.hpp | 29 +- .../TestAtomicOperations_longint.hpp | 29 +- .../TestAtomicOperations_longlongint.hpp | 29 +- .../TestAtomicOperations_unsignedint.hpp | 33 +- .../TestAtomicOperations_unsignedlongint.hpp | 33 +- ...stAtomicOperations_unsignedlonglongint.hpp | 36 + ...da_cc7_asm_atomic_fetch_op.inc_forceglobal | 153 -- .../cuda_cc7_asm_atomic_fetch_op.inc_generic | 151 -- .../cuda_cc7_asm_atomic_fetch_op.inc_isglobal | 57 +- ...cuda_cc7_asm_atomic_fetch_op.inc_predicate | 54 +- .../cuda_cc7_asm_atomic_op.inc_forceglobal | 64 - .../cuda/cuda_cc7_asm_atomic_op.inc_generic | 64 - .../cuda/cuda_cc7_asm_atomic_op.inc_isglobal | 55 +- .../cuda/cuda_cc7_asm_atomic_op.inc_predicate | 55 +- 19 files changed, 622 insertions(+), 1731 deletions(-) create mode 100644 core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic 
delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic diff --git a/core/unit_test/TestAtomicOperations.hpp b/core/unit_test/TestAtomicOperations.hpp index 9965041e52c..a5aebed4138 100644 --- a/core/unit_test/TestAtomicOperations.hpp +++ b/core/unit_test/TestAtomicOperations.hpp @@ -15,1146 +15,439 @@ //@HEADER #include +#include namespace TestAtomicOperations { -//----------------------------------------------- -//--------------zero_functor--------------------- -//----------------------------------------------- - -template -struct ZeroFunctor { - using execution_space = DEVICE_TYPE; - using type = typename Kokkos::View; - using h_type = typename Kokkos::View::HostMirror; - - type data; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { data() = 0; } +struct AddAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_add(ptr_op, update); + T old_val = Kokkos::atomic_fetch_add(ptr_fetch_op, update); + T new_val = Kokkos::atomic_add_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); + } + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old + update; + } + static const char* name() { return "add"; } }; -//----------------------------------------------- -//--------------init_functor--------------------- -//----------------------------------------------- - -template -struct InitFunctor { - using execution_space = DEVICE_TYPE; - using type = typename Kokkos::View; - using h_type = typename Kokkos::View::HostMirror; - - type data; - T init_value; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { data() = init_value; } - - InitFunctor(T _init_value) : init_value(_init_value) {} +struct SubAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { 
+ Kokkos::atomic_sub(ptr_op, update); + T old_val = Kokkos::atomic_fetch_sub(ptr_fetch_op, update); + T new_val = Kokkos::atomic_sub_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); + } + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old - update; + } + static const char* name() { return "sub"; } }; -//--------------------------------------------------- -//--------------atomic_load/store/assign--------------------- -//--------------------------------------------------- -template -struct LoadStoreFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { - T old = Kokkos::atomic_load(&data()); - if (old != i0) - Kokkos::abort("Kokkos Atomic Load didn't get the right value"); - Kokkos::atomic_store(&data(), i1); - Kokkos::atomic_assign(&data(), old); +struct IncAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T) { + Kokkos::atomic_inc(ptr_op); + T old_val = Kokkos::atomic_fetch_inc(ptr_fetch_op); + T new_val = Kokkos::atomic_inc_fetch(ptr_op_fetch); + return Kokkos::pair(old_val, new_val); } - LoadStoreFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T) { + return old + 1; + } + static const char* name() { return "inc"; } }; -template -bool LoadStoreAtomicTest(T i0, T i1) { - using execution_space = typename DeviceType::execution_space; - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct LoadStoreFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - - Kokkos::deep_copy(h_data, data); - - return h_data() == i0; -} - -//--------------------------------------------------- -//--------------atomic_fetch_max--------------------- 
-//--------------------------------------------------- - -template -struct MaxFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { - // Kokkos::atomic_fetch_max( &data(), (T) 1 ); - Kokkos::atomic_fetch_max(&data(), (T)i1); +struct DecAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T) { + Kokkos::atomic_dec(ptr_op); + T old_val = Kokkos::atomic_fetch_dec(ptr_fetch_op); + T new_val = Kokkos::atomic_dec_fetch(ptr_op_fetch); + return Kokkos::pair(old_val, new_val); + } + template + KOKKOS_FUNCTION static T op(T old, T) { + return old - 1; } - MaxFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + static const char* name() { return "dec"; } }; -template -T MaxAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct MaxFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T MaxAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = (i0 > i1 ? 
i0 : i1); - - T val = *data; - delete[] data; - - return val; -} - -template -bool MaxAtomicTest(T i0, T i1) { - T res = MaxAtomic(i0, i1); - T resSerial = MaxAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = MaxAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct MaxAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_max(ptr_op, update); + T old_val = Kokkos::atomic_fetch_max(ptr_fetch_op, update); + T new_val = Kokkos::atomic_max_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_min--------------------- -//--------------------------------------------------- - -template -struct MinFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_min(&data(), (T)i1); } - - MinFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return update > old ? update : old; + } + static const char* name() { return "max"; } }; -template -T MinAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct MinFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T MinAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = (i0 < i1 ? 
i0 : i1); - - T val = *data; - delete[] data; - - return val; -} - -template -bool MinAtomicTest(T i0, T i1) { - T res = MinAtomic(i0, i1); - T resSerial = MinAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = MinAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct MinAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_min(ptr_op, update); + T old_val = Kokkos::atomic_fetch_min(ptr_fetch_op, update); + T new_val = Kokkos::atomic_min_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_increment--------------------- -//--------------------------------------------------- - -template -struct IncFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_increment(&data()); } - - IncFunctor(T _i0) : i0(_i0) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return update < old ? 
update : old; + } + static const char* name() { return "min"; } }; -template -T IncAtomic(T i0) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct IncFunctor f(i0); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T IncAtomicCheck(T i0) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 + 1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool IncAtomicTest(T i0) { - T res = IncAtomic(i0); - T resSerial = IncAtomicCheck(i0); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = IncAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct MulAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_mul(ptr_op, update); + T old_val = Kokkos::atomic_fetch_mul(ptr_fetch_op, update); + T new_val = Kokkos::atomic_mul_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//-------------atomic_wrapping_increment------------- -//--------------------------------------------------- - -template -struct WrappingIncFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { - desul::atomic_fetch_inc_mod(&data(), (T)i1, desul::MemoryOrderRelaxed(), - desul::MemoryScopeDevice()); + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old * update; } - - WrappingIncFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + static const char* name() { return "mul"; } }; -template -T 
WrappingIncAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct WrappingIncFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T WrappingIncAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - // Wraps to 0 when i0 >= i1 - *data = ((i0 >= i1) ? (T)0 : i0 + (T)1); - - T val = *data; - delete[] data; - - return val; -} - -template -bool WrappingIncAtomicTest(T i0, T i1) { - T res = WrappingIncAtomic(i0, i1); - T resSerial = WrappingIncAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = WrappingIncAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct DivAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_div(ptr_op, update); + T old_val = Kokkos::atomic_fetch_div(ptr_fetch_op, update); + T new_val = Kokkos::atomic_div_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_decrement--------------------- -//--------------------------------------------------- - -template -struct DecFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_decrement(&data()); } - - DecFunctor(T _i0) : i0(_i0) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old / update; + } + static const char* name() { return "div"; } }; -template -T DecAtomic(T i0) { - struct InitFunctor f_init(i0); - 
typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct DecFunctor f(i0); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T DecAtomicCheck(T i0) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 - 1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool DecAtomicTest(T i0) { - T res = DecAtomic(i0); - T resSerial = DecAtomicCheck(i0); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = DecAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct ModAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + // Kokkos::atomic_mod(ptr_op, update); + (void)Kokkos::atomic_fetch_mod(ptr_op, update); + T old_val = Kokkos::atomic_fetch_mod(ptr_fetch_op, update); + T new_val = Kokkos::atomic_mod_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//-------------atomic_wrapping_decrement------------- -//--------------------------------------------------- - -template -struct WrappingDecFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { - desul::atomic_fetch_dec_mod(&data(), (T)i1, desul::MemoryOrderRelaxed(), - desul::MemoryScopeDevice()); + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old % update; } - - WrappingDecFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + static const char* name() { return "mod"; } }; -template -T WrappingDecAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename 
InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct WrappingDecFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T WrappingDecAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - // Wraps to i1 when i0 <= 0 - // i0 should never be negative - *data = ((i0 <= (T)0) ? i1 : i0 - (T)1); - - T val = *data; - delete[] data; - - return val; -} - -template -bool WrappingDecAtomicTest(T i0, T i1) { - T res = WrappingDecAtomic(i0, i1); - T resSerial = WrappingDecAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = WrappingDecAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct AndAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_and(ptr_op, update); + T old_val = Kokkos::atomic_fetch_and(ptr_fetch_op, update); + T new_val = Kokkos::atomic_and_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_mul--------------------- -//--------------------------------------------------- - -template -struct MulFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_mul(&data(), (T)i1); } - - MulFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old & update; + } + static const char* name() { return "and"; } }; -template -T MulAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename 
InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct MulFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T MulAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 * i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool MulAtomicTest(T i0, T i1) { - T res = MulAtomic(i0, i1); - T resSerial = MulAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = MulAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct OrAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + Kokkos::atomic_or(ptr_op, update); + T old_val = Kokkos::atomic_fetch_or(ptr_fetch_op, update); + T new_val = Kokkos::atomic_or_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_div--------------------- -//--------------------------------------------------- - -template -struct DivFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_div(&data(), (T)i1); } - - DivFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old | update; + } + static const char* name() { return "or"; } }; -template -T DivAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - 
execution_space().fence(); - - struct DivFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T DivAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 / i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool DivAtomicTest(T i0, T i1) { - T res = DivAtomic(i0, i1); - T resSerial = DivAtomicCheck(i0, i1); - - bool passed = true; - - using Kokkos::abs; - if (abs((resSerial - res) * 1.) > 1e-5) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = DivAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct XorAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + // Kokkos::atomic_xor(ptr_op, update); + (void)Kokkos::atomic_fetch_xor(ptr_op, update); + T old_val = Kokkos::atomic_fetch_xor(ptr_fetch_op, update); + T new_val = Kokkos::atomic_xor_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_mod--------------------- -//--------------------------------------------------- - -template -struct ModFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_mod(&data(), (T)i1); } - - ModFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old ^ update; + } + static const char* name() { return "xor"; } }; -template -T ModAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct 
ModFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T ModAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 % i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool ModAtomicTest(T i0, T i1) { - T res = ModAtomic(i0, i1); - T resSerial = ModAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = ModAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct NandAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + // Kokkos::atomic_nand(ptr_op, update); + (void)Kokkos::atomic_fetch_nand(ptr_op, update); + T old_val = Kokkos::atomic_fetch_nand(ptr_fetch_op, update); + T new_val = Kokkos::atomic_nand_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_and--------------------- -//--------------------------------------------------- - -template -struct AndFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { - T result = Kokkos::atomic_fetch_and(&data(), (T)i1); - Kokkos::atomic_and(&data(), result); + template + KOKKOS_FUNCTION static T op(T old, T update) { + return ~(old & update); } - - AndFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + static const char* name() { return "nand"; } }; -template -T AndAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct AndFunctor f(i0, i1); - - 
f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T AndAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 & i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool AndAtomicTest(T i0, T i1) { - T res = AndAtomic(i0, i1); - T resSerial = AndAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = AndAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct LShiftAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + // Kokkos::atomic_lshift(ptr_op, update); + (void)Kokkos::atomic_fetch_lshift(ptr_op, update); + T old_val = Kokkos::atomic_fetch_lshift(ptr_fetch_op, update); + T new_val = Kokkos::atomic_lshift_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_or---------------------- -//--------------------------------------------------- - -template -struct OrFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { - T result = Kokkos::atomic_fetch_or(&data(), (T)i1); - Kokkos::atomic_or(&data(), result); + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old << update; } - - OrFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + static const char* name() { return "lshift"; } }; -template -T OrAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct OrFunctor f(i0, i1); - - f.data = data; - 
Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T OrAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 | i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool OrAtomicTest(T i0, T i1) { - T res = OrAtomic(i0, i1); - T resSerial = OrAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = OrAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct RShiftAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + // Kokkos::atomic_rshift(ptr_op, update); not implemented + (void)Kokkos::atomic_fetch_rshift(ptr_op, update); + T old_val = Kokkos::atomic_fetch_rshift(ptr_fetch_op, update); + T new_val = Kokkos::atomic_rshift_fetch(ptr_op_fetch, update); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_xor--------------------- -//--------------------------------------------------- - -template -struct XorFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_xor(&data(), (T)i1); } - - XorFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T update) { + return old >> update; + } + static const char* name() { return "rshift"; } }; -template -T XorAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct XorFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - 
execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T XorAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 ^ i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool XorAtomicTest(T i0, T i1) { - T res = XorAtomic(i0, i1); - T resSerial = XorAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = XorAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct LoadStoreAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + T old_val = Kokkos::atomic_load(ptr_op); + Kokkos::atomic_store(ptr_op, update); + Kokkos::atomic_store(ptr_op_fetch, update); + Kokkos::atomic_store(ptr_fetch_op, update); + return Kokkos::pair(old_val, update); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_lshift--------------------- -//--------------------------------------------------- - -template -struct LShiftFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_lshift(&data(), (T)i1); } - - LShiftFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T, T update) { + return update; + } + static const char* name() { return "load/store"; } }; -template -T LShiftAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct LShiftFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} 
- -template -T LShiftAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 << i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool LShiftAtomicTest(T i0, T i1) { - T res = LShiftAtomic(i0, i1); - T resSerial = LShiftAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = LShiftAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct IncModAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T wrap_value) { + // no atomic_inc_mod in desul + (void)desul::atomic_fetch_inc_mod(ptr_op, wrap_value, + desul::MemoryOrderRelaxed(), + desul::MemoryScopeDevice()); + T old_val = desul::atomic_fetch_inc_mod(ptr_fetch_op, wrap_value, + desul::MemoryOrderRelaxed(), + desul::MemoryScopeDevice()); + // no atomic_inc_mod_fetch in desul + (void)desul::atomic_fetch_inc_mod(ptr_op_fetch, wrap_value, + desul::MemoryOrderRelaxed(), + desul::MemoryScopeDevice()); + T new_val = op(old_val, wrap_value); + return Kokkos::pair(old_val, new_val); } - - return passed; -} - -//--------------------------------------------------- -//--------------atomic_fetch_rshift--------------------- -//--------------------------------------------------- - -template -struct RShiftFunctor { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View; - - type data; - T i0; - T i1; - - KOKKOS_INLINE_FUNCTION - void operator()(int) const { Kokkos::atomic_fetch_rshift(&data(), (T)i1); } - - RShiftFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} + template + KOKKOS_FUNCTION static T op(T old, T wrap_value) { + return old + 1 > wrap_value ? 
0 : old + 1; + } + static const char* name() { return "inc_mod"; } }; -template -T RShiftAtomic(T i0, T i1) { - struct InitFunctor f_init(i0); - typename InitFunctor::type data("Data"); - typename InitFunctor::h_type h_data("HData"); - - f_init.data = data; - Kokkos::parallel_for(1, f_init); - execution_space().fence(); - - struct RShiftFunctor f(i0, i1); - - f.data = data; - Kokkos::parallel_for(1, f); - execution_space().fence(); - - Kokkos::deep_copy(h_data, data); - T val = h_data(); - - return val; -} - -template -T RShiftAtomicCheck(T i0, T i1) { - T* data = new T[1]; - data[0] = 0; - - *data = i0 >> i1; - - T val = *data; - delete[] data; - - return val; -} - -template -bool RShiftAtomicTest(T i0, T i1) { - T res = RShiftAtomic(i0, i1); - T resSerial = RShiftAtomicCheck(i0, i1); - - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() << ">( test = RShiftAtomicTest" - << " FAILED : " << resSerial << " != " << res << std::endl; +struct DecModAtomicTest { + template + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T wrap_value) { + // no atomic_dec_mod in desul + (void)desul::atomic_fetch_dec_mod(ptr_op, wrap_value, + desul::MemoryOrderRelaxed(), + desul::MemoryScopeDevice()); + T old_val = desul::atomic_fetch_dec_mod(ptr_fetch_op, wrap_value, + desul::MemoryOrderRelaxed(), + desul::MemoryScopeDevice()); + // no atomic_dec_mod_fetch in desul + (void)desul::atomic_fetch_dec_mod(ptr_op_fetch, wrap_value, + desul::MemoryOrderRelaxed(), + desul::MemoryScopeDevice()); + T new_val = op(old_val, wrap_value); + return Kokkos::pair(old_val, new_val); + } + template + KOKKOS_FUNCTION static T op(T old, T wrap_value) { + return ((old == 0) || (old > wrap_value)) ? 
wrap_value : old - 1; } + static const char* name() { return "dec_mod"; } +}; - return passed; +template +bool atomic_op_test(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (op_data(0) != expected_val) local_result += 1; + if (op_data(1) != expected_val) local_result += 2; + if (op_data(2) != expected_val) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fetch_result.second != expected_val) local_result += 16; + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 4) != 0) + printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; } //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- -template -bool AtomicOperationsTestIntegralType(int i0, int i1, int test) { +template +bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { + T old_val = static_cast(old_val_in); + T update = static_cast(update_in); switch (test) { - case 1: return MaxAtomicTest((T)i0, (T)i1); - case 2: return MinAtomicTest((T)i0, (T)i1); - case 3: return MulAtomicTest((T)i0, (T)i1); - case 4: return DivAtomicTest((T)i0, (T)i1); - 
case 5: return ModAtomicTest((T)i0, (T)i1); - case 6: return AndAtomicTest((T)i0, (T)i1); - case 7: return OrAtomicTest((T)i0, (T)i1); - case 8: return XorAtomicTest((T)i0, (T)i1); - case 9: return LShiftAtomicTest((T)i0, (T)i1); - case 10: return RShiftAtomicTest((T)i0, (T)i1); - case 11: return IncAtomicTest((T)i0); - case 12: return DecAtomicTest((T)i0); - case 13: return LoadStoreAtomicTest((T)i0, (T)i1); + case 0: return atomic_op_test(old_val, update); + case 1: return atomic_op_test(old_val, update); + case 2: return atomic_op_test(old_val, update); + case 3: return atomic_op_test(old_val, update); + case 4: return atomic_op_test(old_val, update); + case 5: + return update != 0 + ? atomic_op_test(old_val, update) + : true; + case 6: + return update != 0 + ? atomic_op_test(old_val, update) + : true; + case 7: return atomic_op_test(old_val, update); + case 8: return atomic_op_test(old_val, update); + case 9: return atomic_op_test(old_val, update); + case 10: + return atomic_op_test(old_val, update); + case 11: + return update_in >= 0 ? atomic_op_test( + old_val, update) + : true; + case 12: + return update_in >= 0 ? 
atomic_op_test( + old_val, update) + : true; + case 13: + return atomic_op_test(old_val, update); + case 14: + return atomic_op_test(old_val, update); + case 15: + return atomic_op_test(old_val, update); } - return 0; + return true; } -template -bool AtomicOperationsTestUnsignedIntegralType(int i0, int i1, int test) { +template +bool AtomicOperationsTestUnsignedIntegralType(int old_val_in, int update_in, + int test) { + T old_val = static_cast(old_val_in); + T update = static_cast(update_in); switch (test) { - case 1: return WrappingIncAtomicTest((T)i0, (T)i1); - case 2: return WrappingDecAtomicTest((T)i0, (T)i1); + case 1: + return atomic_op_test(old_val, update); + case 2: + return atomic_op_test(old_val, update); } - return 0; + return true; } -template -bool AtomicOperationsTestNonIntegralType(int i0, int i1, int test) { +template +bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, + int test) { + T old_val = static_cast(old_val_in); + T update = static_cast(update_in); switch (test) { - case 1: return MaxAtomicTest((T)i0, (T)i1); - case 2: return MinAtomicTest((T)i0, (T)i1); - case 3: return MulAtomicTest((T)i0, (T)i1); - case 4: return DivAtomicTest((T)i0, (T)i1); - case 5: return LoadStoreAtomicTest((T)i0, (T)i1); + case 0: return atomic_op_test(old_val, update); + case 1: return atomic_op_test(old_val, update); + case 2: return atomic_op_test(old_val, update); + case 3: return atomic_op_test(old_val, update); + case 4: return atomic_op_test(old_val, update); + case 5: + return update != 0 + ? 
atomic_op_test(old_val, update) + : true; + case 6: + return atomic_op_test(old_val, update); } - return 0; + return true; } - } // namespace TestAtomicOperations diff --git a/core/unit_test/TestAtomicOperations_complexdouble.hpp b/core/unit_test/TestAtomicOperations_complexdouble.hpp index 852fade58ba..5708fd2ebf7 100644 --- a/core/unit_test/TestAtomicOperations_complexdouble.hpp +++ b/core/unit_test/TestAtomicOperations_complexdouble.hpp @@ -16,6 +16,8 @@ #include +using namespace TestAtomicOperations; + namespace Test { TEST(TEST_CATEGORY, atomic_operations_complexdouble) { #if defined(KOKKOS_ENABLE_SYCL) && \ @@ -23,15 +25,29 @@ TEST(TEST_CATEGORY, atomic_operations_complexdouble) { if (std::is_same_v) GTEST_SKIP() << "skipping since device_global variables are not available"; #endif - const int start = 1; // Avoid zero for division. + const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { + using T = Kokkos::complex; + T old_val = static_cast(i); + T update = static_cast(end - i - start); + ASSERT_TRUE( + (atomic_op_test(old_val, update))); ASSERT_TRUE( - (TestAtomicOperations::MulAtomicTest, - TEST_EXECSPACE>(start, end - i))); + (atomic_op_test(old_val, update))); ASSERT_TRUE( - (TestAtomicOperations::DivAtomicTest, - TEST_EXECSPACE>(start, end - i))); + (atomic_op_test(old_val, update))); + + // FIXME_32BIT disable division test for 32bit where we have accuracy issues + // with division atomics still compile it though + if (sizeof(void*) == 8) { + ASSERT_TRUE((update != 0 + ? 
atomic_op_test( + old_val, update) + : true)); + } + ASSERT_TRUE((atomic_op_test( + old_val, update))); } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_complexfloat.hpp b/core/unit_test/TestAtomicOperations_complexfloat.hpp index b23e55a339e..97bfeea6ad9 100644 --- a/core/unit_test/TestAtomicOperations_complexfloat.hpp +++ b/core/unit_test/TestAtomicOperations_complexfloat.hpp @@ -16,17 +16,33 @@ #include +using namespace TestAtomicOperations; + namespace Test { TEST(TEST_CATEGORY, atomic_operations_complexfloat) { - const int start = 1; // Avoid zero for division. + const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { + using T = Kokkos::complex; + T old_val = static_cast(i); + T update = static_cast(end - i - start); + ASSERT_TRUE( + (atomic_op_test(old_val, update))); ASSERT_TRUE( - (TestAtomicOperations::MulAtomicTest, - TEST_EXECSPACE>(start, end - i))); + (atomic_op_test(old_val, update))); ASSERT_TRUE( - (TestAtomicOperations::DivAtomicTest, - TEST_EXECSPACE>(start, end - i))); + (atomic_op_test(old_val, update))); + + // FIXME_32BIT disable division test for 32bit where we have accuracy issues + // with division atomics still compile it though + if (sizeof(void*) == 8) { + ASSERT_TRUE((update != 0 + ? atomic_op_test( + old_val, update) + : true)); + } + ASSERT_TRUE((atomic_op_test( + old_val, update))); } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_double.hpp b/core/unit_test/TestAtomicOperations_double.hpp index 0dea91f4165..30f7e5e3bde 100644 --- a/core/unit_test/TestAtomicOperations_double.hpp +++ b/core/unit_test/TestAtomicOperations_double.hpp @@ -18,19 +18,16 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_double) { - const int start = 1; // Avoid zero for division. 
+ const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - double, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - double, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - double, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - double, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - double, TEST_EXECSPACE>(start, end - i, 5))); + for (int t = 0; t < 8; t++) + // FIXME_32BIT disable division test for 32bit where we have accuracy + // issues with division atomics still compile it though + if (t != 5 || sizeof(void*) == 8) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + double, TEST_EXECSPACE>(i, end - i + start, t))); + } } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_float.hpp b/core/unit_test/TestAtomicOperations_float.hpp index 23348f20ec9..73ea4398089 100644 --- a/core/unit_test/TestAtomicOperations_float.hpp +++ b/core/unit_test/TestAtomicOperations_float.hpp @@ -18,19 +18,16 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_float) { - const int start = 1; // Avoid zero for division. 
+ const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - float, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - float, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - float, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - float, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< - float, TEST_EXECSPACE>(start, end - i, 5))); + for (int t = 0; t < 8; t++) + // FIXME_32BIT disable division test for 32bit where we have accuracy + // issues with division atomics still compile it though + if (t != 5 || sizeof(void*) == 8) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + double, TEST_EXECSPACE>(i, end - i + start, t))); + } } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_int.hpp b/core/unit_test/TestAtomicOperations_int.hpp index 31cddf20baa..5aeaecd7af4 100644 --- a/core/unit_test/TestAtomicOperations_int.hpp +++ b/core/unit_test/TestAtomicOperations_int.hpp @@ -18,33 +18,12 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_int) { - const int start = 1; // Avoid zero for division. 
+ const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 5))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 6))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 7))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 8))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 9))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 11))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 12))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - int, TEST_EXECSPACE>(start, end - i, 13))); + for (int t = 0; t < 16; t++) + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(i, end - i + start, t))); } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_longint.hpp b/core/unit_test/TestAtomicOperations_longint.hpp index 3171e61018f..b181171dd58 100644 --- a/core/unit_test/TestAtomicOperations_longint.hpp +++ b/core/unit_test/TestAtomicOperations_longint.hpp @@ -18,33 +18,12 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_long) { - const 
int start = 1; // Avoid zero for division. + const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 5))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 6))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 7))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 8))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 9))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 11))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 12))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long int, TEST_EXECSPACE>(start, end - i, 13))); + for (int t = 0; t < 16; t++) + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(i, end - i + start, t))); } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_longlongint.hpp b/core/unit_test/TestAtomicOperations_longlongint.hpp index f7bef416d63..aa21722f474 100644 --- a/core/unit_test/TestAtomicOperations_longlongint.hpp +++ 
b/core/unit_test/TestAtomicOperations_longlongint.hpp @@ -18,33 +18,12 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_longlong) { - const int start = 1; // Avoid zero for division. + const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 5))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 6))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 7))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 8))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 9))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 11))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 12))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - long long int, TEST_EXECSPACE>(start, end - i, 13))); + for (int t = 0; t < 16; t++) + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(i, end - i + start, t))); } } } // namespace Test diff --git 
a/core/unit_test/TestAtomicOperations_unsignedint.hpp b/core/unit_test/TestAtomicOperations_unsignedint.hpp index f844c9062de..96acb94bb16 100644 --- a/core/unit_test/TestAtomicOperations_unsignedint.hpp +++ b/core/unit_test/TestAtomicOperations_unsignedint.hpp @@ -18,39 +18,18 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_unsigned) { - const int start = 1; // Avoid zero for division. + const int start = 0; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 5))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 6))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 7))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 8))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 9))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 11))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 12))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 13))); + for (int t = 0; 
t < 16; t++) + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(i, end - i + start, t))); ASSERT_TRUE( (TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 1))); // Wrapping Inc + unsigned int, TEST_EXECSPACE>(i, end - i, 1))); // Wrapping Inc ASSERT_TRUE( (TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< - unsigned int, TEST_EXECSPACE>(start, end - i, 2))); // Wrapping Dec + unsigned int, TEST_EXECSPACE>(i, end - i, 2))); // Wrapping Dec } } } // namespace Test diff --git a/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/core/unit_test/TestAtomicOperations_unsignedlongint.hpp index 8b6ca64e995..3482f6fe1ed 100644 --- a/core/unit_test/TestAtomicOperations_unsignedlongint.hpp +++ b/core/unit_test/TestAtomicOperations_unsignedlongint.hpp @@ -18,38 +18,17 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_unsignedlong) { - const int start = 1; // Avoid zero for division. 
+ const int start = 0; const int end = 11; for (int i = start; i < end; ++i) { - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 1))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 2))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 3))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 4))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 5))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 6))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 7))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 8))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 9))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 11))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 12))); - ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, 13))); + for (int t = 0; t < 16; t++) + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(i, end - i + start, t))); ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, + unsigned long int, TEST_EXECSPACE>(i, end - i, 1))); // 
Wrapping Inc ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< - unsigned long int, TEST_EXECSPACE>(start, end - i, + unsigned long int, TEST_EXECSPACE>(i, end - i, 2))); // Wrapping Dec } } diff --git a/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp b/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp new file mode 100644 index 00000000000..cf41dedccb7 --- /dev/null +++ b/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_unsignedlonglong) { + const int start = 0; + const int end = 11; + for (int i = start; i < end; ++i) { + for (int t = 0; t < 16; t++) + ASSERT_TRUE( + (TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long long int, TEST_EXECSPACE>(i, end - i + start, t))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< + unsigned long long int, TEST_EXECSPACE>(i, end - i, + 1))); // Wrapping Inc + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestUnsignedIntegralType< + unsigned long long int, TEST_EXECSPACE>(i, end - i, + 2))); // Wrapping Dec + } +} +} // namespace Test diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal deleted file mode 100644 index b2351638207..00000000000 --- 
a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal +++ /dev/null @@ -1,153 +0,0 @@ - -// Inline PTX: h u16 , r u32, l u64, f f32, d f64 -// Ops: - -// binary operations - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint32_t asm_value = reinterpret_cast(value); \ - uint32_t asm_result = 0u; \ - asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint64_t asm_value = reinterpret_cast(value); \ - uint64_t asm_result = 0u; \ - asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint32_t asm_value = reinterpret_cast(value); \ - uint32_t asm_result = 0u; \ - asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint64_t 
asm_value = reinterpret_cast(value); \ - uint64_t asm_result = 0u; \ - asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint32_t asm_value = reinterpret_cast(value); \ - uint32_t asm_result = 0u; \ - asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint64_t asm_value = reinterpret_cast(value); \ - uint64_t asm_result = 0u; \ - asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} - -// Fetch atomics -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype 
atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - ctype neg_value = -value; \ - asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \ - return result; \ -} - - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - ctype limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} \ -inline __device__ ctype atomic_fetch_inc_mod(ctype* dest, ctype limit, 
__DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - ctype limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} \ -inline __device__ ctype atomic_fetch_dec_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} - -// Group ops for integer ctypes -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() - - -// Instantiate Functions -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() - -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND - diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic deleted file mode 100644 index 0484d109c3d..00000000000 --- a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic +++ /dev/null @@ -1,151 +0,0 @@ - -// Inline PTX: h u16 , r u32, l u64, f f32, d f64 -// Ops: - -// binary operations - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_and(ctype* dest, ctype value, 
__DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint32_t asm_value = reinterpret_cast(value); \ - uint32_t asm_result = 0u; \ - asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint64_t asm_value = reinterpret_cast(value); \ - uint64_t asm_result = 0u; \ - asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint32_t asm_value = reinterpret_cast(value); \ - uint32_t asm_result = 0u; \ - asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint64_t asm_value = reinterpret_cast(value); \ - uint64_t asm_result = 0u; \ - asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \ -template \ -inline __device__ 
typename std::enable_if::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint32_t asm_value = reinterpret_cast(value); \ - uint32_t asm_result = 0u; \ - asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} \ -template \ -inline __device__ typename std::enable_if::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - uint64_t asm_value = reinterpret_cast(value); \ - uint64_t asm_result = 0u; \ - asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \ - return reinterpret_cast(asm_result); \ -} - -// Fetch atomics -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - ctype neg_value = -value; \ - asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \ - return result; \ -} - -#define 
__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - asm volatile("atom.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result=0; \ - asm volatile("atom.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - ctype limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} \ -inline __device__ ctype atomic_fetch_inc_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -inline __device__ ctype atomic_fetch_dec(ctype* 
dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - ctype limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} \ -inline __device__ ctype atomic_fetch_dec_mod(ctype* dest, ctype limit, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - ctype result = 0; \ - asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \ - return result; \ -} -// Group ops for integer ctypes -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() - - -// Instantiate Functions -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f") 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() - -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC -#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND - diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal index ef5798f2113..dd359405b5e 100644 --- a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal +++ b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal @@ -164,39 +164,56 @@ inline __device__ ctype device_atomic_fetch_dec_mod(ctype* dest, ctype limit, __ return result; \ } -// Group ops for integer ctypes -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - -#define 
__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - #define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() - // Instantiate Functions + +// General comments: +// - float/double only support add +// - inc/dec only supported with uint32_t +// - int64_t does not support add + +// floating point types __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l") - +// uint32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint32_t,".u32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint32_t,".u32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint32_t,".u32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint32_t,".u32","r","=r") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r") +// uint64_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint64_t,".u64","l","=l") 
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint64_t,".u64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint64_t,".u64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint64_t,".u64","l","=l") + +// int32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int32_t,".s32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int32_t,".s32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int32_t,".s32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int32_t,".s32","r","=r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int32_t,".s32","r","=r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int32_t,".s32","r","=r") + +// int64_t note: add/sub is using unsigned register +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int64_t,".s64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int64_t,".s64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int64_t,".s64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int64_t,".s64","l","=l") + __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD @@ -205,4 +222,4 @@ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND - +#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate index c80efc5e7cf..2c0fe9e132c 100644 --- a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate +++ b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate @@ -206,19 +206,6 @@ inline __device__ ctype device_atomic_fetch_dec_mod(ctype* dest, ctype limit, __ return result; \ } -// Group ops for integer ctypes -#define 
__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) - #define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \ @@ -226,19 +213,50 @@ __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() // Instantiate Functions + +// General comments: +// - float/double only support add (only add/sub are instantiated for them) +// - inc/dec are only supported with uint32_t +// - int64_t does not support a signed add; add/sub below use the ".u64" type + +// floating point types __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l") - +// uint32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint32_t,".u32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint32_t,".u32","r","=r") 
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint32_t,".u32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint32_t,".u32","r","=r") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r") __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r") +// uint64_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(uint64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(uint64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(uint64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(uint64_t,".u64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint64_t,".u64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint64_t,".u64","l","=l") + +// int32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int32_t,".s32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int32_t,".s32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int32_t,".s32","r","=r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int32_t,".s32","r","=r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int32_t,".s32","r","=r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int32_t,".s32","r","=r") + +// int64_t note: add/sub is using unsigned register +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(int64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(int64_t,".u64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(int64_t,".s64","l","=l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(int64_t,".s64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(int64_t,".s64","l","=l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(int64_t,".s64","l","=l") + __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() #undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal deleted file mode 100644 index 3767b2ab498..00000000000 --- a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal +++ /dev/null @@ -1,64 +0,0 @@ - -// Inline PTX: h u16 , r u32, l 
u64, f f32, d f64 -// Ops: - -// Non Returning Atomic Operations -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - type neg_value = -value; \ - asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \ -inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \ -inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - type limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " 
[%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \ -inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - type limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \ -} - -// Group ops for integer types -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) - -// Instantiate Functions -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l") diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic deleted file mode 100644 index 5de36a3e0a8..00000000000 --- 
a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic +++ /dev/null @@ -1,64 +0,0 @@ - -// Inline PTX: h u16 , r u32, l u64, f f32, d f64 -// Ops: - -// Non Returning Atomic Operations -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - type neg_value = -value; \ - asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - asm volatile("red.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \ -inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - asm volatile("red.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \ -inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - type limit = desul::Impl::numeric_limits_max::value; \ - asm 
volatile("red.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \ -} - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \ -inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \ - type limit = desul::Impl::numeric_limits_max::value; \ - asm volatile("red.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \ -} - -// Group ops for integer types -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) - -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) - -// Instantiate Functions -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f") -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r") - -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l") diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal 
b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal index 7bc6d4d9d19..6b6e3593fdd 100644 --- a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal +++ b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal @@ -60,29 +60,48 @@ inline __device__ void device_atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMOR } \ } -// Group ops for integer types -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) +// Instantiate Functions -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) +// General comments: +// - float/double only support add (only add/sub are instantiated for them) +// - inc/dec are only supported with uint32_t +// - int64_t does not support a signed add; add/sub below use the ".u64" type -// Instantiate Functions +// floating point types __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f") __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f") __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d") __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r") +// uint32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint32_t,".u32","r") + +// uint64_t 
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint64_t,".u64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint64_t,".u64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint64_t,".u64","l") + +// int32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int32_t,".s32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int32_t,".s32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int32_t,".s32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int32_t,".s32","r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int32_t,".s32","r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int32_t,".s32","r") + +// int64_t note: add/sub is using unsigned register +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int64_t,".s64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int64_t,".s64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int64_t,".s64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int64_t,".s64","l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l") diff --git a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate index 4ae8e46266e..b9569faf1bd 100644 --- a/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate +++ b/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate @@ -78,29 +78,48 @@ inline __device__ void device_atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMOR :: "l"(dest),reg_type(limit) : "memory"); \ } -// Group ops for integer types -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ 
-__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) +// Instantiate Functions -#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \ -__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) +// General comments: +// - float/double only support add (only add/sub are instantiated for them) +// - inc/dec are only supported with uint32_t +// - int64_t does not support a signed add; add/sub below use the ".u64" type -// Instantiate Functions +// floating point types __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f") __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f") __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d") __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d") -__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r") +// uint32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint32_t,".u32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint32_t,".u32","r") + +// uint64_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(uint64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(uint64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(uint64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(uint64_t,".u64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(uint64_t,".u64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(uint64_t,".u64","l") + +// int32_t +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int32_t,".s32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int32_t,".s32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int32_t,".s32","r") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int32_t,".s32","r") 
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int32_t,".s32","r") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int32_t,".s32","r") + +// int64_t note: add/sub are using unsigned register! +__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(int64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(int64_t,".u64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(int64_t,".s64","l") +__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(int64_t,".s64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(int64_t,".s64","l") +//__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(int64_t,".s64","l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l") -__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r") -//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l") From 8659ffa0bccafd446e40ad43952d0a8b93f93cc1 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sun, 8 Oct 2023 20:16:56 -0400 Subject: [PATCH 043/432] Fix example/build_cmake_installed_different_compiler --- example/build_cmake_installed_different_compiler/foo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/build_cmake_installed_different_compiler/foo.cpp b/example/build_cmake_installed_different_compiler/foo.cpp index e17d34aab15..7630802ae95 100644 --- a/example/build_cmake_installed_different_compiler/foo.cpp +++ b/example/build_cmake_installed_different_compiler/foo.cpp @@ -15,7 +15,7 @@ //@HEADER #include -#include +#include struct CountFunctor { KOKKOS_FUNCTION void operator()(const long i, long& lcount) const { From 1f4e3d5dbc08fed9b49d6658f6a6fd7a6df90d93 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Mon, 9 Oct 2023 16:08:35 +0200 Subject: [PATCH 044/432] fix impl --- .../src/std_algorithms/impl/Kokkos_CopyIf.hpp | 40 +++++++----- .../impl/Kokkos_MustUseKokkosSingleInTeam.hpp | 64 +++++++++++++++++++ .../std_algorithms/impl/Kokkos_UniqueCopy.hpp | 61 ++++++++++-------- 3 files changed, 123 insertions(+), 42 deletions(-) create mode 100644 algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp diff --git 
a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index d23e5398479..79f342e81ba 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -20,6 +20,7 @@ #include #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" +#include "Kokkos_MustUseKokkosSingleInTeam.hpp" #include #include @@ -122,25 +123,32 @@ KOKKOS_FUNCTION OutputIterator copy_if_team_impl( return d_first; } - // FIXME: there is no parallel_scan overload that accepts TeamThreadRange and - // return_value, so temporarily serial implementation is used here const std::size_t num_elements = Kokkos::Experimental::distance(first, last); - std::size_t count = 0; - Kokkos::single( - Kokkos::PerTeam(teamHandle), - [=](std::size_t& lcount) { - lcount = 0; - for (std::size_t i = 0; i < num_elements; ++i) { - const auto& myval = first[i]; - if (pred(myval)) { - d_first[lcount++] = myval; + if constexpr (stdalgo_must_use_kokkos_single_for_team_scan< + typename TeamHandleType::execution_space>::value) { + std::size_t count = 0; + Kokkos::single( + Kokkos::PerTeam(teamHandle), + [=](std::size_t& lcount) { + lcount = 0; + for (std::size_t i = 0; i < num_elements; ++i) { + const auto& myval = first[i]; + if (pred(myval)) { + d_first[lcount++] = myval; + } } - } - }, - count); - // no barrier needed since single above broadcasts to all members + }, + count); + // no barrier needed since single above broadcasts to all members + return d_first + count; - return d_first + count; + } else { + typename InputIterator::difference_type count = 0; + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + StdCopyIfFunctor(first, d_first, pred), count); + // no barrier needed because of the scan accumulating into count + return d_first + count; + } } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp 
b/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp new file mode 100644 index 00000000000..62e2636f2ac --- /dev/null +++ b/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_STD_ALGORITHMS_MUSTUSEKOKKOSSINGLEINTEAM_HPP +#define KOKKOS_STD_ALGORITHMS_MUSTUSEKOKKOSSINGLEINTEAM_HPP + +#include + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template +struct stdalgo_must_use_kokkos_single_for_team_scan : std::false_type {}; + +// the following do not support the overload for team-level scan +// accepting an "out" value to store the scan result + +// FIXME_OPENACC +#if defined(KOKKOS_ENABLE_OPENACC) +template <> +struct stdalgo_must_use_kokkos_single_for_team_scan< + Kokkos::Experimental::OpenACC> : std::true_type {}; +#endif + +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +template <> +struct stdalgo_must_use_kokkos_single_for_team_scan< + Kokkos::Experimental::OpenMPTarget> : std::true_type {}; +#endif + +// FIXME_HPX +#if defined(KOKKOS_ENABLE_HPX) +template <> +struct stdalgo_must_use_kokkos_single_for_team_scan + : std::true_type {}; +#endif + +// FIXME_THREADS +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct stdalgo_must_use_kokkos_single_for_team_scan + : std::true_type {}; +#endif + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git 
a/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp index 5540c3a81e2..5084f57b14e 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp @@ -20,6 +20,7 @@ #include #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" +#include "Kokkos_MustUseKokkosSingleInTeam.hpp" #include "Kokkos_CopyCopyN.hpp" #include #include @@ -138,33 +139,41 @@ KOKKOS_FUNCTION OutputIterator unique_copy_team_impl( } else if (num_elements == 1) { d_first[0] = first[0]; return d_first + 1; - } else { - // FIXME: parallel_scan is what we used for the execution space impl, - // but parallel_scan does not support TeamThreadRange, so for the - // team-level impl we do this serially for now and later figure out - // if this can be done in parallel - - std::size_t count = 0; - Kokkos::single( - Kokkos::PerTeam(teamHandle), - [=](std::size_t& lcount) { - lcount = 0; - for (std::size_t i = 0; i < num_elements - 1; ++i) { - const auto& val_i = first[i]; - const auto& val_ip1 = first[i + 1]; - if (!pred(val_i, val_ip1)) { - d_first[lcount++] = val_i; + } + + else { + if constexpr (stdalgo_must_use_kokkos_single_for_team_scan< + typename TeamHandleType::execution_space>::value) { + std::size_t count = 0; + Kokkos::single( + Kokkos::PerTeam(teamHandle), + [=](std::size_t& lcount) { + lcount = 0; + for (std::size_t i = 0; i < num_elements - 1; ++i) { + const auto& val_i = first[i]; + const auto& val_ip1 = first[i + 1]; + if (!pred(val_i, val_ip1)) { + d_first[lcount++] = val_i; + } } - } - // we need to copy the last element always - d_first[lcount++] = first[num_elements - 1]; - }, - count); - // no barrier needed since single above broadcasts to all members - - // return the correct iterator: we need +1 here because we need to - // return iterator to the element past the last element copied - return d_first + count; + // we need to copy the 
last element always + d_first[lcount++] = first[num_elements - 1]; + }, + count); + // no barrier needed since single above broadcasts to all members + + return d_first + count; + } else { + const auto scan_size = num_elements - 1; + std::size_t count = 0; + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, scan_size), + StdUniqueCopyFunctor(first, last, d_first, pred), + count); + // no barrier needed since reducing into count + + return Impl::copy_team_impl(teamHandle, first + scan_size, last, + d_first + count); + } } } From f5c0cc5a4c06c7259e7e956ece51366e933729c7 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Mon, 9 Oct 2023 09:08:39 -0600 Subject: [PATCH 045/432] Update core/src/HIP/Kokkos_HIP_KernelLaunch.hpp --- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 5aa34feea22..7cd0afcf47f 100644 --- a/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -394,7 +394,7 @@ struct HIPParallelLaunchKernelInvoker Date: Mon, 9 Oct 2023 13:02:01 -0400 Subject: [PATCH 046/432] Split Kokkos_Threads_Parallel files --- core/src/Threads/Kokkos_Threads.hpp | 16 - core/src/Threads/Kokkos_ThreadsExec.hpp | 2 + .../Kokkos_Threads_ParallelFor_MDRange.hpp | 115 +++++ .../Kokkos_Threads_ParallelFor_Range.hpp | 122 +++++ .../Kokkos_Threads_ParallelFor_Team.hpp | 118 +++++ ...Kokkos_Threads_ParallelReduce_MDRange.hpp} | 91 +--- .../Kokkos_Threads_ParallelReduce_Range.hpp | 171 +++++++ ...=> Kokkos_Threads_ParallelReduce_Team.hpp} | 94 +--- .../Kokkos_Threads_ParallelScan_Range.hpp | 198 ++++++++ .../Threads/Kokkos_Threads_Parallel_Range.hpp | 435 ------------------ .../Kokkos_Threads_WorkGraphPolicy.hpp | 2 +- core/src/decl/Kokkos_Declare_THREADS.hpp | 10 + 12 files changed, 741 insertions(+), 633 deletions(-) create mode 100644 core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp 
create mode 100644 core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp create mode 100644 core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp rename core/src/Threads/{Kokkos_Threads_Parallel_MDRange.hpp => Kokkos_Threads_ParallelReduce_MDRange.hpp} (65%) create mode 100644 core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp rename core/src/Threads/{Kokkos_Threads_Parallel_Team.hpp => Kokkos_Threads_ParallelReduce_Team.hpp} (59%) create mode 100644 core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp delete mode 100644 core/src/Threads/Kokkos_Threads_Parallel_Range.hpp diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index db3b771f2b4..c0d70c03ecb 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -169,21 +169,5 @@ struct MemorySpaceAccess -#include -#include -#include -#include -#include -#include -#include - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - #endif /* #if defined( KOKKOS_ENABLE_THREADS ) */ #endif /* #define KOKKOS_THREADS_HPP */ diff --git a/core/src/Threads/Kokkos_ThreadsExec.hpp b/core/src/Threads/Kokkos_ThreadsExec.hpp index 35e815828a6..377e096bfbe 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -26,8 +26,10 @@ #include #include +#include #include +#include //---------------------------------------------------------------------------- diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp new file mode 100644 index 00000000000..0828f262993 --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -0,0 +1,115 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_THREADS_PARALLEL_FOR_MDRANGE_HPP + +#include + +#include +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Threads> { + private: + using MDRangePolicy = Kokkos::MDRangePolicy; + using Policy = typename MDRangePolicy::impl_range_policy; + + using WorkTag = typename MDRangePolicy::work_tag; + + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using iterate_type = typename Kokkos::Impl::HostIterateTile< + MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>; + + const iterate_type m_iter; + + inline void exec_range(const Member ibeg, const Member iend) const { + for (Member i = ibeg; i < iend; ++i) { + m_iter(i); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + exec_schedule(exec, arg); + } + + template + static std::enable_if_t::value> + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + auto const num_tiles = self.m_iter.m_rp.m_num_tiles; + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), + exec.pool_size()); + + self.exec_range(range.begin(), range.end()); + + exec.fan_in(); + } + + template + static std::enable_if_t::value> + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + auto const num_tiles = self.m_iter.m_rp.m_num_tiles; + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), + 
exec.pool_size()); + + exec.set_work_range(range.begin(), range.end(), 1); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while (work_index != -1) { + const Member begin = static_cast(work_index); + const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; + + self.exec_range(begin, end); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + + public: + inline void execute() const { + ThreadsExec::start(&ParallelFor::exec, this); + ThreadsExec::fence(); + } + + ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) + : m_iter(arg_policy, arg_functor) {} + + template + static int max_tile_size_product(const Policy &, const Functor &) { + /** + * 1024 here is just our guess for a reasonable max tile size, + * it isn't a hardware constraint. If people see a use for larger + * tile size products, we're happy to change this. + */ + return 1024; + } +}; + +} // namespace Impl +} // namespace Kokkos +#endif diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp new file mode 100644 index 00000000000..3698416ef18 --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -0,0 +1,122 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_THREADS_PARALLEL_FOR_RANGE_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member ibeg, const Member iend) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i); + } + } + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member ibeg, const Member iend) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + exec_schedule(exec, arg); + } + + template + static std::enable_if_t::value> + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + ParallelFor::template exec_range(self.m_functor, range.begin(), + range.end()); + + exec.fan_in(); + } + + template + static std::enable_if_t::value> + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + exec.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + 
exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while (work_index != -1) { + const Member begin = + static_cast(work_index) * self.m_policy.chunk_size() + + self.m_policy.begin(); + const Member end = + begin + self.m_policy.chunk_size() < self.m_policy.end() + ? begin + self.m_policy.chunk_size() + : self.m_policy.end(); + ParallelFor::template exec_range(self.m_functor, begin, end); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + + public: + inline void execute() const { + ThreadsExec::start(&ParallelFor::exec, this); + ThreadsExec::fence(); + } + + ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp new file mode 100644 index 00000000000..36404857a22 --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -0,0 +1,118 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_THREADS_PARALLEL_FOR_TEAM_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Threads> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + const size_t m_shared; + + template + inline static std::enable_if_t::value && + std::is_same::value> + exec_team(const FunctorType &functor, Member member) { + for (; member.valid_static(); member.next_static()) { + functor(member); + } + } + + template + inline static std::enable_if_t::value && + std::is_same::value> + exec_team(const FunctorType &functor, Member member) { + const TagType t{}; + for (; member.valid_static(); member.next_static()) { + functor(t, member); + } + } + + template + inline static std::enable_if_t::value && + std::is_same::value> + exec_team(const FunctorType &functor, Member member) { + for (; member.valid_dynamic(); member.next_dynamic()) { + functor(member); + } + } + + template + inline static std::enable_if_t::value && + std::is_same::value> + exec_team(const FunctorType &functor, Member member) { + const TagType t{}; + for (; member.valid_dynamic(); member.next_dynamic()) { + functor(t, member); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + ParallelFor::exec_team( + self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + + exec.barrier(); + exec.fan_in(); + } + template + Policy fix_policy(Policy policy) { + if (policy.impl_vector_length() < 0) { + policy.impl_set_vector_length(1); + } + if (policy.team_size() < 0) { + policy.impl_set_team_size( + policy.team_size_recommended(m_functor, ParallelForTag{})); + } + return policy; + } + + public: + inline void 
execute() const { + ThreadsExec::resize_scratch( + 0, Policy::member_type::team_reduce_size() + m_shared); + + ThreadsExec::start(&ParallelFor::exec, this); + + ThreadsExec::fence(); + } + + ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), + m_policy(fix_policy(arg_policy)), + m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor, m_policy.team_size())) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp similarity index 65% rename from core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp rename to core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 9d06249082b..3d06379480f 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADS_PARALLEL_MDRANGE_HPP -#define KOKKOS_THREADS_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_THREADS_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_THREADS_PARALLEL_REDUCE_MDRANGE_HPP #include @@ -24,93 +24,6 @@ namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Threads> { - private: - using MDRangePolicy = Kokkos::MDRangePolicy; - using Policy = typename MDRangePolicy::impl_range_policy; - - using WorkTag = typename MDRangePolicy::work_tag; - - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - - using iterate_type = typename Kokkos::Impl::HostIterateTile< - MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>; - - const iterate_type m_iter; - - inline void exec_range(const Member ibeg, const Member iend) const { - for (Member i = ibeg; i < iend; ++i) { - m_iter(i); - } - } - - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); - } - - template - static 
std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { - const ParallelFor &self = *((const ParallelFor *)arg); - - auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); - - self.exec_range(range.begin(), range.end()); - - exec.fan_in(); - } - - template - static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { - const ParallelFor &self = *((const ParallelFor *)arg); - - auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); - - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); - - long work_index = exec.get_work_index(); - - while (work_index != -1) { - const Member begin = static_cast(work_index); - const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; - - self.exec_range(begin, end); - work_index = exec.get_work_index(); - } - - exec.fan_in(); - } - - public: - inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); - } - - ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) - : m_iter(arg_policy, arg_functor) {} - - template - static int max_tile_size_product(const Policy &, const Functor &) { - /** - * 1024 here is just our guess for a reasonable max tile size, - * it isn't a hardware constraint. If people see a use for larger - * tile size products, we're happy to change this. 
- */ - return 1024; - } -}; - template class ParallelReduce, Kokkos::Threads> { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp new file mode 100644 index 00000000000..5fa97b403c4 --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -0,0 +1,171 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_PARALLEL_REDUCE_RANGE_HPP +#define KOKKOS_THREADS_PARALLEL_REDUCE_RANGE_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, + Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member &ibeg, const Member &iend, + reference_type update) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; 
++i) { + functor(i, update); + } + } + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member &ibeg, const Member &iend, + reference_type update) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i, update); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + exec_schedule(exec, arg); + } + + template + static std::enable_if_t::value> + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + const ReducerType &reducer = self.m_functor_reducer.get_reducer(); + + ParallelReduce::template exec_range( + self.m_functor_reducer.get_functor(), range.begin(), range.end(), + reducer.init(static_cast(exec.reduce_memory()))); + + exec.fan_in_reduce(reducer); + } + + template + static std::enable_if_t::value> + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + exec.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + const ReducerType &reducer = self.m_functor_reducer.get_reducer(); + + reference_type update = + reducer.init(static_cast(exec.reduce_memory())); + while (work_index != -1) { + const Member begin = + static_cast(work_index) * self.m_policy.chunk_size() + + self.m_policy.begin(); + const Member end = + begin + self.m_policy.chunk_size() < self.m_policy.end() + ? 
begin + self.m_policy.chunk_size() + : self.m_policy.end(); + ParallelReduce::template exec_range( + self.m_functor_reducer.get_functor(), begin, end, update); + work_index = exec.get_work_index(); + } + + exec.fan_in_reduce(reducer); + } + + public: + inline void execute() const { + const ReducerType &reducer = m_functor_reducer.get_reducer(); + + if (m_policy.end() <= m_policy.begin()) { + if (m_result_ptr) { + reducer.init(m_result_ptr); + reducer.final(m_result_ptr); + } + } else { + ThreadsExec::resize_scratch(reducer.value_size(), 0); + + ThreadsExec::start(&ParallelReduce::exec, this); + + ThreadsExec::fence(); + + if (m_result_ptr) { + const pointer_type data = + (pointer_type)ThreadsExec::root_reduce_scratch(); + + const unsigned n = reducer.value_count(); + for (unsigned i = 0; i < n; ++i) { + m_result_ptr[i] = data[i]; + } + } + } + } + + template + ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, + const Policy &arg_policy, const ViewType &arg_result_view) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result_view.data()) { + static_assert(Kokkos::is_view::value, + "Kokkos::Threads reduce result must be a View"); + + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::Threads reduce result must be a View accessible from " + "HostSpace"); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp similarity index 59% rename from core/src/Threads/Kokkos_Threads_Parallel_Team.hpp rename to core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index 28efba5ed2e..c4b6100a9df 100644 --- a/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -14,104 +14,14 @@ // //@HEADER -#ifndef KOKKOS_THREADS_PARALLEL_TEAM_HPP -#define KOKKOS_THREADS_PARALLEL_TEAM_HPP +#ifndef 
KOKKOS_THREADS_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_THREADS_PARALLEL_REDUCE_TEAM_HPP #include namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Threads> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - - const FunctorType m_functor; - const Policy m_policy; - const size_t m_shared; - - template - inline static std::enable_if_t::value && - std::is_same::value> - exec_team(const FunctorType &functor, Member member) { - for (; member.valid_static(); member.next_static()) { - functor(member); - } - } - - template - inline static std::enable_if_t::value && - std::is_same::value> - exec_team(const FunctorType &functor, Member member) { - const TagType t{}; - for (; member.valid_static(); member.next_static()) { - functor(t, member); - } - } - - template - inline static std::enable_if_t::value && - std::is_same::value> - exec_team(const FunctorType &functor, Member member) { - for (; member.valid_dynamic(); member.next_dynamic()) { - functor(member); - } - } - - template - inline static std::enable_if_t::value && - std::is_same::value> - exec_team(const FunctorType &functor, Member member) { - const TagType t{}; - for (; member.valid_dynamic(); member.next_dynamic()) { - functor(t, member); - } - } - - static void exec(ThreadsExec &exec, const void *arg) { - const ParallelFor &self = *((const ParallelFor *)arg); - - ParallelFor::exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); - - exec.barrier(); - exec.fan_in(); - } - template - Policy fix_policy(Policy policy) { - if (policy.impl_vector_length() < 0) { - policy.impl_set_vector_length(1); - } - if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); - } - return policy; - } - - public: - inline void execute() const { - ThreadsExec::resize_scratch( - 0, Policy::member_type::team_reduce_size() + 
m_shared); - - ThreadsExec::start(&ParallelFor::exec, this); - - ThreadsExec::fence(); - } - - ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) - : m_functor(arg_functor), - m_policy(fix_policy(arg_policy)), - m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, m_policy.team_size())) {} -}; - template class ParallelReduce, Kokkos::Threads> { diff --git a/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp new file mode 100644 index 00000000000..74d8561a34b --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -0,0 +1,198 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_THREADS_PARALLEL_SCAN_RANGE_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelScan, + Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy; + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using Analysis = Impl::FunctorAnalysis; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member &ibeg, const Member &iend, + reference_type update, const bool final) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i, update, final); + } + } + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member &ibeg, const Member &iend, + reference_type update, const bool final) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i, update, final); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelScan &self = *((const ParallelScan *)arg); + + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + typename Analysis::Reducer final_reducer(self.m_functor); + + reference_type update = + final_reducer.init(static_cast(exec.reduce_memory())); + + ParallelScan::template exec_range(self.m_functor, range.begin(), + range.end(), update, false); + + // exec.template scan_large( final_reducer ); + exec.scan_small(final_reducer); + + 
ParallelScan::template exec_range(self.m_functor, range.begin(), + range.end(), update, true); + + exec.fan_in(); + } + + public: + inline void execute() const { + ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsExec::start(&ParallelScan::exec, this); + ThreadsExec::fence(); + } + + ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +template +class ParallelScanWithTotal, + ReturnType, Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy; + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + using Analysis = Impl::FunctorAnalysis; + + using value_type = typename Analysis::value_type; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + const pointer_type m_result_ptr; + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member &ibeg, const Member &iend, + reference_type update, const bool final) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i, update, final); + } + } + + template + inline static std::enable_if_t::value> exec_range( + const FunctorType &functor, const Member &ibeg, const Member &iend, + reference_type update, const bool final) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i, update, final); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); + + const WorkRange range(self.m_policy, exec.pool_rank(), 
exec.pool_size()); + + typename Analysis::Reducer final_reducer(self.m_functor); + + reference_type update = + final_reducer.init(static_cast(exec.reduce_memory())); + + ParallelScanWithTotal::template exec_range( + self.m_functor, range.begin(), range.end(), update, false); + + // exec.template scan_large(final_reducer); + exec.scan_small(final_reducer); + + ParallelScanWithTotal::template exec_range( + self.m_functor, range.begin(), range.end(), update, true); + + exec.fan_in(); + + if (exec.pool_rank() == exec.pool_size() - 1) { + *self.m_result_ptr = update; + } + } + + public: + inline void execute() const { + ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsExec::start(&ParallelScanWithTotal::exec, this); + ThreadsExec::fence(); + } + + template + ParallelScanWithTotal(const FunctorType &arg_functor, + const Policy &arg_policy, + const ViewType &arg_result_view) + : m_functor(arg_functor), + m_policy(arg_policy), + m_result_ptr(arg_result_view.data()) { + static_assert( + Kokkos::Impl::MemorySpaceAccess::accessible, + "Kokkos::Threads parallel_scan result must be host-accessible!"); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp deleted file mode 100644 index f8c2867739b..00000000000 --- a/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp +++ /dev/null @@ -1,435 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_THREADS_PARALLEL_RANGE_HPP -#define KOKKOS_THREADS_PARALLEL_RANGE_HPP - -#include - -namespace Kokkos { -namespace Impl { - -template -class ParallelFor, - Kokkos::Threads> { - private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member ibeg, const Member iend) { -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(i); - } - } - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member ibeg, const Member iend) { - const TagType t{}; -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(t, i); - } - } - - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); - } - - template - static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { - const ParallelFor &self = *((const ParallelFor *)arg); - - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - - ParallelFor::template exec_range(self.m_functor, range.begin(), - range.end()); - - exec.fan_in(); - } - - template - static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { - const ParallelFor &self = *((const ParallelFor *)arg); - - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - 
exec.barrier(); - - long work_index = exec.get_work_index(); - - while (work_index != -1) { - const Member begin = - static_cast(work_index) * self.m_policy.chunk_size() + - self.m_policy.begin(); - const Member end = - begin + self.m_policy.chunk_size() < self.m_policy.end() - ? begin + self.m_policy.chunk_size() - : self.m_policy.end(); - ParallelFor::template exec_range(self.m_functor, begin, end); - work_index = exec.get_work_index(); - } - - exec.fan_in(); - } - - public: - inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); - } - - ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -template -class ParallelReduce, - Kokkos::Threads> { - private: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member &ibeg, const Member &iend, - reference_type update) { -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(i, update); - } - } - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member &ibeg, const Member &iend, - reference_type update) { - const TagType t{}; -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma 
ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(t, i, update); - } - } - - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); - } - - template - static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { - const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - - const ReducerType &reducer = self.m_functor_reducer.get_reducer(); - - ParallelReduce::template exec_range( - self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); - - exec.fan_in_reduce(reducer); - } - - template - static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { - const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); - - long work_index = exec.get_work_index(); - const ReducerType &reducer = self.m_functor_reducer.get_reducer(); - - reference_type update = - reducer.init(static_cast(exec.reduce_memory())); - while (work_index != -1) { - const Member begin = - static_cast(work_index) * self.m_policy.chunk_size() + - self.m_policy.begin(); - const Member end = - begin + self.m_policy.chunk_size() < self.m_policy.end() - ? 
begin + self.m_policy.chunk_size() - : self.m_policy.end(); - ParallelReduce::template exec_range( - self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); - } - - exec.fan_in_reduce(reducer); - } - - public: - inline void execute() const { - const ReducerType &reducer = m_functor_reducer.get_reducer(); - - if (m_policy.end() <= m_policy.begin()) { - if (m_result_ptr) { - reducer.init(m_result_ptr); - reducer.final(m_result_ptr); - } - } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); - - ThreadsExec::start(&ParallelReduce::exec, this); - - ThreadsExec::fence(); - - if (m_result_ptr) { - const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); - - const unsigned n = reducer.value_count(); - for (unsigned i = 0; i < n; ++i) { - m_result_ptr[i] = data[i]; - } - } - } - } - - template - ParallelReduce(const CombinedFunctorReducerType &arg_functor_reducer, - const Policy &arg_policy, const ViewType &arg_result_view) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result_view.data()) { - static_assert(Kokkos::is_view::value, - "Kokkos::Threads reduce result must be a View"); - - static_assert( - Kokkos::Impl::MemorySpaceAccess::accessible, - "Kokkos::Threads reduce result must be a View accessible from " - "HostSpace"); - } -}; - -template -class ParallelScan, - Kokkos::Threads> { - private: - using Policy = Kokkos::RangePolicy; - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Analysis = Impl::FunctorAnalysis; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member &ibeg, const Member &iend, - reference_type update, const bool final) { 
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(i, update, final); - } - } - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member &ibeg, const Member &iend, - reference_type update, const bool final) { - const TagType t{}; -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(t, i, update, final); - } - } - - static void exec(ThreadsExec &exec, const void *arg) { - const ParallelScan &self = *((const ParallelScan *)arg); - - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - - typename Analysis::Reducer final_reducer(self.m_functor); - - reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); - - ParallelScan::template exec_range(self.m_functor, range.begin(), - range.end(), update, false); - - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); - - ParallelScan::template exec_range(self.m_functor, range.begin(), - range.end(), update, true); - - exec.fan_in(); - } - - public: - inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, this); - ThreadsExec::fence(); - } - - ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -template -class ParallelScanWithTotal, - ReturnType, Kokkos::Threads> { - private: - using Policy = Kokkos::RangePolicy; - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - - using Analysis = Impl::FunctorAnalysis; - - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using 
reference_type = typename Analysis::reference_type; - - const FunctorType m_functor; - const Policy m_policy; - const pointer_type m_result_ptr; - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member &ibeg, const Member &iend, - reference_type update, const bool final) { -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(i, update, final); - } - } - - template - inline static std::enable_if_t::value> exec_range( - const FunctorType &functor, const Member &ibeg, const Member &iend, - reference_type update, const bool final) { - const TagType t{}; -#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ - defined(KOKKOS_ENABLE_PRAGMA_IVDEP) -#pragma ivdep -#endif - for (Member i = ibeg; i < iend; ++i) { - functor(t, i, update, final); - } - } - - static void exec(ThreadsExec &exec, const void *arg) { - const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); - - typename Analysis::Reducer final_reducer(self.m_functor); - - reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); - - ParallelScanWithTotal::template exec_range( - self.m_functor, range.begin(), range.end(), update, false); - - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); - - ParallelScanWithTotal::template exec_range( - self.m_functor, range.begin(), range.end(), update, true); - - exec.fan_in(); - - if (exec.pool_rank() == exec.pool_size() - 1) { - *self.m_result_ptr = update; - } - } - - public: - inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); - } - - template - ParallelScanWithTotal(const FunctorType &arg_functor, - const Policy &arg_policy, - 
const ViewType &arg_result_view) - : m_functor(arg_functor), - m_policy(arg_policy), - m_result_ptr(arg_result_view.data()) { - static_assert( - Kokkos::Impl::MemorySpaceAccess::accessible, - "Kokkos::Threads parallel_scan result must be host-accessible!"); - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index 797044b117d..d4ce697548f 100644 --- a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { diff --git a/core/src/decl/Kokkos_Declare_THREADS.hpp b/core/src/decl/Kokkos_Declare_THREADS.hpp index 16f134b6f28..f5cbc0c1d1d 100644 --- a/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,17 @@ #if defined(KOKKOS_ENABLE_THREADS) #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #endif #endif From 23496b47e9595a154beab1118ead98615430895a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 9 Oct 2023 13:23:33 -0400 Subject: [PATCH 047/432] HPX: Implement TeamThread and ThreadVector parallel_scan with return value --- core/src/HPX/Kokkos_HPX.hpp | 45 ++++++++++++++++++++++++++++--- core/unit_test/TestTeamScan.hpp | 3 +-- core/unit_test/TestTeamVector.hpp | 6 ++--- 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index d668b4effb0..1dfc5b40646 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -1828,16 +1828,18 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } } -template +template KOKKOS_INLINE_FUNCTION void parallel_scan( Impl::TeamThreadRangeBoundariesStruct const &loop_boundaries, - const FunctorType &lambda) { - using value_type = 
typename Kokkos::Impl::FunctorAnalysis< + const FunctorType &lambda, ValueType &return_val) { + using functor_value_type = typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, void>::value_type; + static_assert(std::is_same_v, + "Non-matching value types of functor and return type"); - value_type scan_val = value_type(); + ValueType scan_val{}; // Intra-member scan for (iType i = loop_boundaries.start; i < loop_boundaries.end; @@ -1852,6 +1854,22 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( i += loop_boundaries.increment) { lambda(i, scan_val, true); } + + return_val = scan_val; +} + +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct + &loop_bounds, + const FunctorType &lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; + + value_type scan_val; + parallel_scan(loop_bounds, lambda, scan_val); } /** \brief Intra-thread vector parallel exclusive prefix sum. 
Executes @@ -1905,6 +1923,25 @@ parallel_scan( i += loop_boundaries.increment) { lambda(i, scan_val, true); } + reducer.reference() = scan_val; +} + +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct + &loop_boundaries, + const FunctorType &lambda, ValueType &return_val) { + // Extract ValueType from FunctorType + using closure_value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, FunctorType, + void>::value_type; + static_assert(std::is_same::value, + "Non-matching value types of closure and return type"); + + ValueType accum; + parallel_scan(loop_boundaries, lambda, Kokkos::Sum(accum)); + + return_val = accum; } template diff --git a/core/unit_test/TestTeamScan.hpp b/core/unit_test/TestTeamScan.hpp index 401a56ac3a2..8f67fb01a5f 100644 --- a/core/unit_test/TestTeamScan.hpp +++ b/core/unit_test/TestTeamScan.hpp @@ -132,8 +132,7 @@ TEST(TEST_CATEGORY, team_scan) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ - !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) template struct TestTeamScanRetVal { using execution_space = ExecutionSpace; diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index c6fa182938e..6cf2208cb52 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -607,8 +607,7 @@ struct functor_vec_scan { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. 
-#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ - !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) template struct functor_vec_scan_ret_val { using policy_type = Kokkos::TeamPolicy; @@ -735,8 +734,7 @@ bool test_scalar(int nteams, int team_size, int test) { } else if (test == 12) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ - !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) Kokkos::parallel_for( Kokkos::TeamPolicy(nteams, team_size, 8), functor_vec_scan_ret_val(d_flag, team_size)); From ebef19bdf226761884ac0aef3996804d525cecc1 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 9 Oct 2023 22:27:28 -0400 Subject: [PATCH 048/432] Serial: Allow for distinct execution space instances (#6441) * Serial: Allow for distinct execution space instances * Remove redundant test * Make default constructor for NewInstance explicit * Shorten partition_space implementation. 
Co-authored-by: Damien L-G * Fix unused function warning --------- Co-authored-by: Damien L-G --- core/src/Serial/Kokkos_Serial.cpp | 6 ++ core/src/Serial/Kokkos_Serial.hpp | 38 ++++++++++++ core/unit_test/TestExecSpacePartitioning.hpp | 61 +++++++------------- 3 files changed, 66 insertions(+), 39 deletions(-) diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index 6d55dffb7ce..e81e8349391 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -145,6 +145,12 @@ Serial::Serial() : m_space_instance(&Impl::SerialInternal::singleton(), [](Impl::SerialInternal*) {}) {} +Serial::Serial(NewInstance) + : m_space_instance(new Impl::SerialInternal, [](Impl::SerialInternal* ptr) { + ptr->finalize(); + delete ptr; + }) {} + void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index 2ade37705ea..db1567610b2 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -72,6 +72,10 @@ class SerialInternal { }; } // namespace Impl +struct NewInstance { + explicit NewInstance() = default; +}; + /// \class Serial /// \brief Kokkos device for non-parallel execution /// @@ -108,6 +112,8 @@ class Serial { Serial(); + Serial(NewInstance); + /// \brief True if and only if this method is being called in a /// thread-parallel function. /// @@ -218,6 +224,38 @@ struct MemorySpaceAccess +std::vector partition_space(const Serial&, Args...) { + static_assert( + (... 
&& std::is_arithmetic_v), + "Kokkos Error: partitioning arguments must be integers or floats"); + std::vector instances; + instances.reserve(sizeof...(Args)); + std::generate_n(std::back_inserter(instances), sizeof...(Args), + []() { return Serial{NewInstance{}}; }); + return instances; +} + +template +std::vector partition_space(const Serial&, + std::vector const& weights) { + static_assert( + std::is_arithmetic::value, + "Kokkos Error: partitioning arguments must be integers or floats"); + + // We only care about the number of instances to create and ignore weights + // otherwise. + std::vector instances; + instances.reserve(weights.size()); + std::generate_n(std::back_inserter(instances), weights.size(), + []() { return Serial{NewInstance{}}; }); + return instances; +} + +} // namespace Kokkos::Experimental + #include #include #include diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 1a70b7b2676..65314d6be7c 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -29,30 +29,35 @@ struct SumFunctor { }; template -void check_distinctive(ExecSpace, ExecSpace) {} - +void check_distinctive([[maybe_unused]] ExecSpace exec1, + [[maybe_unused]] ExecSpace exec2) { +#ifdef KOKKOS_ENABLE_SERIAL + if constexpr (std::is_same_v) { + ASSERT_NE(exec1, exec2); + } +#endif +#ifdef KOKKOS_ENABLE_OPENMP + if constexpr (std::is_same_v) { + ASSERT_NE(exec1, exec2); + } +#endif #ifdef KOKKOS_ENABLE_CUDA -void check_distinctive(Kokkos::Cuda exec1, Kokkos::Cuda exec2) { - ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream()); -} + if constexpr (std::is_same_v) { + ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream()); + } #endif #ifdef KOKKOS_ENABLE_HIP -void check_distinctive(Kokkos::HIP exec1, Kokkos::HIP exec2) { - ASSERT_NE(exec1.hip_stream(), exec2.hip_stream()); -} + if constexpr (std::is_same_v) { + ASSERT_NE(exec1.hip_stream(), exec2.hip_stream()); + } #endif #ifdef 
KOKKOS_ENABLE_SYCL -void check_distinctive(Kokkos::Experimental::SYCL exec1, - Kokkos::Experimental::SYCL exec2) { - ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue, - *exec2.impl_internal_space_instance()->m_queue); -} + if constexpr (std::is_same_v) { + ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue, + *exec2.impl_internal_space_instance()->m_queue); + } #endif -#ifdef KOKKOS_ENABLE_OPENMP -void check_distinctive(Kokkos::OpenMP exec1, Kokkos::OpenMP exec2) { - ASSERT_NE(exec1, exec2); } -#endif } // namespace #ifdef KOKKOS_ENABLE_OPENMP @@ -99,28 +104,6 @@ void test_partitioning(std::vector& instances) { }); ASSERT_EQ(sum1, sum2); ASSERT_EQ(sum1, N * (N - 1) / 2); - -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMP) - // Eliminate unused function warning - // (i.e. when compiling for Serial and CUDA, during Serial compilation the - // Cuda overload is unused ...) - if (sum1 != sum2) { -#ifdef KOKKOS_ENABLE_CUDA - check_distinctive(Kokkos::Cuda(), Kokkos::Cuda()); -#endif -#ifdef KOKKOS_ENABLE_HIP - check_distinctive(Kokkos::HIP(), Kokkos::HIP()); -#endif -#ifdef KOKKOS_ENABLE_SYCL - check_distinctive(Kokkos::Experimental::SYCL(), - Kokkos::Experimental::SYCL()); -#endif -#ifdef KOKKOS_ENABLE_OPENMP - check_distinctive(Kokkos::OpenMP(), Kokkos::OpenMP()); -#endif - } -#endif } TEST(TEST_CATEGORY, partitioning_by_args) { From fdfeaf916d3b673f1237099265f038b42d2bbae2 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Tue, 10 Oct 2023 08:02:05 +0200 Subject: [PATCH 049/432] add overload for TeamThreadRange --- .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 33 +++++++++++++++---- core/unit_test/TestTeamScan.hpp | 3 +- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp index a9e24994e0d..05f2e4db4cf 100644 --- 
a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp @@ -32,15 +32,17 @@ namespace Kokkos { // This is largely the same code as in HIP and CUDA except for the member name -template +template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, - const FunctorType& lambda) { - using Analysis = Impl::FunctorAnalysis, FunctorType, void>; - using value_type = typename Analysis::value_type; + using analysis_value_type = typename Analysis::value_type; + static_assert(std::is_same_v, + "Non-matching value types of functor and return type"); const auto start = loop_bounds.start; const auto end = loop_bounds.end; @@ -50,24 +52,27 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const auto team_rank = member.team_rank(); #if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND) - value_type scan_val = value_type(); + ValueType scan_val = {}; if (team_rank == 0) { for (iType i = start; i < end; ++i) { lambda(i, scan_val, true); } } + member.team_broadcast(scan_val, 0); + return_val = scan_val; + #pragma omp barrier #else const auto team_size = member.team_size(); const auto nchunk = (end - start + team_size - 1) / team_size; - value_type accum = 0; + ValueType accum = {}; // each team has to process one or // more chunks of the prefix scan for (iType i = 0; i < nchunk; ++i) { auto ii = start + i * team_size + team_rank; // local accumulation for this chunk - value_type local_accum = 0; + ValueType local_accum = {}; // user updates value with prefix value if (ii < loop_bounds.end) lambda(ii, local_accum, false); // perform team scan @@ -81,9 +86,23 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( // broadcast last value to rest of the team member.team_broadcast(accum, team_size - 1); } + return_val = accum; + #endif } +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct< + 
iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, + const FunctorType& lambda) { + using Analysis = Impl::FunctorAnalysis, + FunctorType, void>; + using value_type = typename Analysis::value_type; + value_type scan_val; + parallel_scan(loop_bounds, lambda, scan_val); +} } // namespace Kokkos namespace Kokkos { diff --git a/core/unit_test/TestTeamScan.hpp b/core/unit_test/TestTeamScan.hpp index 401a56ac3a2..bcb8fcdf6c0 100644 --- a/core/unit_test/TestTeamScan.hpp +++ b/core/unit_test/TestTeamScan.hpp @@ -132,8 +132,7 @@ TEST(TEST_CATEGORY, team_scan) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \ - !defined(KOKKOS_ENABLE_HPX) +#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_HPX) template struct TestTeamScanRetVal { using execution_space = ExecutionSpace; From 5b693fd954aa24c8dda6bef128e38e844500a77e Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Tue, 10 Oct 2023 08:05:13 +0200 Subject: [PATCH 050/432] address review comment --- algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp | 4 ++-- .../impl/Kokkos_MustUseKokkosSingleInTeam.hpp | 9 +++------ algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index 79f342e81ba..dbf637b2c91 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -124,8 +124,8 @@ KOKKOS_FUNCTION OutputIterator copy_if_team_impl( } const std::size_t num_elements = Kokkos::Experimental::distance(first, last); - if constexpr (stdalgo_must_use_kokkos_single_for_team_scan< - typename TeamHandleType::execution_space>::value) { + if constexpr (stdalgo_must_use_kokkos_single_for_team_scan_v< + typename 
TeamHandleType::execution_space>) { std::size_t count = 0; Kokkos::single( Kokkos::PerTeam(teamHandle), diff --git a/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp b/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp index 62e2636f2ac..e45b2069154 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp @@ -50,12 +50,9 @@ struct stdalgo_must_use_kokkos_single_for_team_scan : std::true_type {}; #endif -// FIXME_THREADS -#if defined(KOKKOS_ENABLE_THREADS) -template <> -struct stdalgo_must_use_kokkos_single_for_team_scan - : std::true_type {}; -#endif +template +inline constexpr bool stdalgo_must_use_kokkos_single_for_team_scan_v = + stdalgo_must_use_kokkos_single_for_team_scan::value; } // namespace Impl } // namespace Experimental diff --git a/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp index 5084f57b14e..2a82461e614 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp @@ -142,8 +142,8 @@ KOKKOS_FUNCTION OutputIterator unique_copy_team_impl( } else { - if constexpr (stdalgo_must_use_kokkos_single_for_team_scan< - typename TeamHandleType::execution_space>::value) { + if constexpr (stdalgo_must_use_kokkos_single_for_team_scan_v< + typename TeamHandleType::execution_space>) { std::size_t count = 0; Kokkos::single( Kokkos::PerTeam(teamHandle), From 8420c2f00660ac25de4ab4b9ede2fdf47125e291 Mon Sep 17 00:00:00 2001 From: IanBogle <107126280+IanBogle@users.noreply.github.com> Date: Tue, 10 Oct 2023 15:59:08 -0400 Subject: [PATCH 051/432] Update to HIP TeamPolicy Block number heuristic (#6284) * Update to TeamPolicy Block number heuristic to improve performance in mid-range regimes * Commented out UseShflReduction=false and related changes, each one has a nearby FIXME_HIP. 
Tests are passing, build succeeds. * Applied formatting patch * Removed commented out code for smaller size types * Added check for HintLightWeight_t, to allow more consistent performance on light weight kernels * Applied formatting patch * Removed outdated comment block * Another format patch * Move computation of block size to a function * Make compute_block_count const --------- Co-authored-by: Bruno Turcksin --- core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 38 ++++++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index b81ea7913ad..3fe568ac361 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -590,6 +590,9 @@ class ParallelReduce= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + public: __device__ inline void operator()() const { int64_t threadid = 0; @@ -681,7 +713,6 @@ class ParallelReduce() + threadIdx.y * word_count.value); - // Iterate this block through the league iterate_through_league(threadid, value); @@ -757,10 +788,7 @@ class ParallelReduce::value; if (!is_empty_range || need_device_set) { - const int block_count = - UseShflReduction - ? 
std::min(m_league_size, size_type(1024 * HIPTraits::WarpSize)) - : std::min(static_cast(m_league_size), m_team_size); + int const block_count = compute_block_count(); m_scratch_space = hip_internal_scratch_space( m_policy.space(), reducer.value_size() * block_count); From f511dca95dd0c065e1de95d190c630a3e9c95602 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 11 Oct 2023 10:49:06 -0400 Subject: [PATCH 052/432] SIMD: Split math functions from SIMD_Common.hpp (#6487) * SIMD: Split math functions from SIMD_Common.hpp * Make Kokkos_SIMD_Common_Math.hpp self-contained * Enforce header inclusion order * Clean up comment in Kokkos_SIMD_Common_Math.hpp --- simd/src/Kokkos_SIMD.hpp | 2 + simd/src/Kokkos_SIMD_AVX2.hpp | 5 + simd/src/Kokkos_SIMD_AVX512.hpp | 5 + simd/src/Kokkos_SIMD_Common.hpp | 224 ----------------- simd/src/Kokkos_SIMD_Common_Math.hpp | 260 ++++++++++++++++++++ simd/src/Kokkos_SIMD_NEON.hpp | 5 + simd/src/Kokkos_SIMD_Scalar.hpp | 5 + simd/unit_tests/include/SIMDTesting_Ops.hpp | 2 +- 8 files changed, 283 insertions(+), 225 deletions(-) create mode 100644 simd/src/Kokkos_SIMD_Common_Math.hpp diff --git a/simd/src/Kokkos_SIMD.hpp b/simd/src/Kokkos_SIMD.hpp index 2bb551fdbfe..57d4afd88be 100644 --- a/simd/src/Kokkos_SIMD.hpp +++ b/simd/src/Kokkos_SIMD.hpp @@ -74,6 +74,8 @@ #pragma GCC diagnostic pop #endif +#include + namespace Kokkos { namespace Experimental { diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index cf9894ed2c4..82f284d513e 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -25,6 +25,11 @@ #include +#ifdef KOKKOS_SIMD_COMMON_MATH_HPP +#error \ + "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" +#endif + // FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. 
#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 66b922b2271..40a3bf375b4 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -25,6 +25,11 @@ #include +#ifdef KOKKOS_SIMD_COMMON_MATH_HPP +#error \ + "Kokkos_SIMD_AVX512.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" +#endif + namespace Kokkos { namespace Experimental { diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index cb0879338e4..a54ce37a6c4 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -17,7 +17,6 @@ #ifndef KOKKOS_SIMD_COMMON_HPP #define KOKKOS_SIMD_COMMON_HPP -#include #include #include @@ -329,230 +328,7 @@ template return a == simd_mask(false); } -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T -hmin(const_where_expression, simd> const& x) { - auto const& v = x.impl_get_value(); - auto const& m = x.impl_get_mask(); - auto result = Kokkos::reduction_identity::min(); - for (std::size_t i = 0; i < v.size(); ++i) { - if (m[i]) result = Kokkos::min(result, v[i]); - } - return result; -} - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T -hmax(const_where_expression, simd> const& x) { - auto const& v = x.impl_get_value(); - auto const& m = x.impl_get_mask(); - auto result = Kokkos::reduction_identity::max(); - for (std::size_t i = 0; i < v.size(); ++i) { - if (m[i]) result = Kokkos::max(result, v[i]); - } - return result; -} - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T -reduce(const_where_expression, simd> const& x, T, - std::plus<>) { - auto const& v = x.impl_get_value(); - auto const& m = x.impl_get_mask(); - auto result = Kokkos::reduction_identity::sum(); - for (std::size_t i = 0; i < v.size(); ++i) { - if (m[i]) result += v[i]; - } - return result; -} - -} // namespace Experimental - -template 
-[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd min( - Experimental::simd const& a, Experimental::simd const& b) { - Experimental::simd result; - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { - result[i] = Kokkos::min(a[i], b[i]); - } - return result; -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Experimental { -template -[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd - min(Experimental::simd const& a, - Experimental::simd const& b) { - return Kokkos::min(a, b); -} -} // namespace Experimental -#endif - -template -[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd max( - Experimental::simd const& a, Experimental::simd const& b) { - Experimental::simd result; - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { - result[i] = Kokkos::max(a[i], b[i]); - } - return result; -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Experimental { -template -[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd - max(Experimental::simd const& a, - Experimental::simd const& b) { - return Kokkos::max(a, b); -} } // namespace Experimental -#endif - -// fallback implementations of functions. -// individual Abi types may provide overloads with more efficient -// implementations. 
-// These are not in the Experimental namespace because their double -// overloads are not either - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i]); \ - } \ - return result; \ - } \ - namespace Experimental { \ - template \ - [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ - simd \ - FUNC(simd const& a) { \ - return Kokkos::FUNC(a); \ - } \ - } -#else -#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i]); \ - } \ - return result; \ - } -#endif - -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(abs) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp2) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log10) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log2) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sqrt) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cbrt) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sin) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cos) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tan) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asin) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acos) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atan) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sinh) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cosh) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tanh) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asinh) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acosh) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atanh) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erf) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erfc) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tgamma) -KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma) - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -#define 
KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a, \ - Experimental::simd const& b) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i]); \ - } \ - return result; \ - } \ - namespace Experimental { \ - template \ - [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ - simd \ - FUNC(simd const& a, simd const& b) { \ - Kokkos::FUNC(a, b); \ - } \ - } -#else -#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a, \ - Experimental::simd const& b) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i]); \ - } \ - return result; \ - } -#endif - -KOKKOS_IMPL_SIMD_BINARY_FUNCTION(pow) -KOKKOS_IMPL_SIMD_BINARY_FUNCTION(hypot) -KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2) -KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign) - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a, \ - Experimental::simd const& b, \ - Experimental::simd const& c) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ - } \ - return result; \ - } \ - namespace Experimental { \ - template \ - [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ - simd \ - FUNC(simd const& a, simd const& b, \ - simd const& c) { \ - return Kokkos::FUNC(a, b, c); \ - } \ - } -#else -#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ - template \ - [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ - Experimental::simd const& a, \ - Experimental::simd 
const& b, \ - Experimental::simd const& c) { \ - Experimental::simd result; \ - for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ - result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ - } \ - return result; \ - } -#endif - -KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(fma) -KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(hypot) } // namespace Kokkos diff --git a/simd/src/Kokkos_SIMD_Common_Math.hpp b/simd/src/Kokkos_SIMD_Common_Math.hpp new file mode 100644 index 00000000000..8c6a9559604 --- /dev/null +++ b/simd/src/Kokkos_SIMD_Common_Math.hpp @@ -0,0 +1,260 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SIMD_COMMON_MATH_HPP +#define KOKKOS_SIMD_COMMON_MATH_HPP + +#include // Kokkos::min, etc. 
+ +namespace Kokkos { + +namespace Experimental { + +template +class simd; + +template +class simd_mask; + +template +class const_where_expression; + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +hmin(const_where_expression, simd> const& x) { + auto const& v = x.impl_get_value(); + auto const& m = x.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; +} + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +hmax(const_where_expression, simd> const& x) { + auto const& v = x.impl_get_value(); + auto const& m = x.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; +} + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +reduce(const_where_expression, simd> const& x, T, + std::plus<>) { + auto const& v = x.impl_get_value(); + auto const& m = x.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; +} + +} // namespace Experimental + +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd min( + Experimental::simd const& a, Experimental::simd const& b) { + Experimental::simd result; + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { + result[i] = Kokkos::min(a[i], b[i]); + } + return result; +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Experimental { +template +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd + min(Experimental::simd const& a, + Experimental::simd const& b) { + return Kokkos::min(a, b); +} +} // namespace Experimental +#endif + +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd max( + Experimental::simd const& a, Experimental::simd const& 
b) { + Experimental::simd result; + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { + result[i] = Kokkos::max(a[i], b[i]); + } + return result; +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Experimental { +template +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd + max(Experimental::simd const& a, + Experimental::simd const& b) { + return Kokkos::max(a, b); +} +} // namespace Experimental +#endif + +// fallback implementations of functions. +// individual Abi types may provide overloads with more efficient +// implementations. + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ + simd \ + FUNC(simd const& a) { \ + return Kokkos::FUNC(a); \ + } \ + } +#else +#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i]); \ + } \ + return result; \ + } +#endif + +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(abs) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp2) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log10) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log2) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sqrt) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cbrt) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sin) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cos) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tan) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asin) 
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acos) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atan) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sinh) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cosh) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tanh) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asinh) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acosh) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atanh) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erf) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erfc) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tgamma) +KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma) + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ + simd \ + FUNC(simd const& a, simd const& b) { \ + Kokkos::FUNC(a, b); \ + } \ + } +#else +#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i]); \ + } \ + return result; \ + } +#endif + +KOKKOS_IMPL_SIMD_BINARY_FUNCTION(pow) +KOKKOS_IMPL_SIMD_BINARY_FUNCTION(hypot) +KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2) +KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign) + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b, \ + Experimental::simd const& c) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < 
Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ + } \ + return result; \ + } \ + namespace Experimental { \ + template \ + [[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ + simd \ + FUNC(simd const& a, simd const& b, \ + simd const& c) { \ + return Kokkos::FUNC(a, b, c); \ + } \ + } +#else +#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC) \ + template \ + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd FUNC( \ + Experimental::simd const& a, \ + Experimental::simd const& b, \ + Experimental::simd const& c) { \ + Experimental::simd result; \ + for (std::size_t i = 0; i < Experimental::simd::size(); ++i) { \ + result[i] = Kokkos::FUNC(a[i], b[i], c[i]); \ + } \ + return result; \ + } +#endif + +KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(fma) +KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(hypot) + +} // namespace Kokkos + +#endif diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index c3aca998b11..6d82294c46d 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -24,6 +24,11 @@ #include +#ifdef KOKKOS_SIMD_COMMON_MATH_HPP +#error \ + "Kokkos_SIMD_NEON.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" +#endif + namespace Kokkos { namespace Experimental { diff --git a/simd/src/Kokkos_SIMD_Scalar.hpp b/simd/src/Kokkos_SIMD_Scalar.hpp index bf6aeb9dd39..f10547e15d0 100644 --- a/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/simd/src/Kokkos_SIMD_Scalar.hpp @@ -23,6 +23,11 @@ #include +#ifdef KOKKOS_SIMD_COMMON_MATH_HPP +#error \ + "Kokkos_SIMD_Scalar.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" 
+#endif + namespace Kokkos { namespace Experimental { diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 02af4f82eb8..60dff68f309 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -80,7 +80,7 @@ class absolutes { template auto on_host(T const& a) const { if constexpr (std::is_signed_v) { -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) && !defined(KOKKOS_COMPILER_NVCC) +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) return Kokkos::Experimental::abs(a); #else return Kokkos::abs(a); From a856f973e565a4ca3d8a425a2e71782138e215c1 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 11 Oct 2023 09:17:04 -0400 Subject: [PATCH 053/432] Allow NVHPC as device compiler only with Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON --- .jenkins | 1 + cmake/kokkos_compiler_id.cmake | 9 ++++++--- cmake/kokkos_enable_options.cmake | 1 + cmake/kokkos_test_cxx_std.cmake | 8 ++++++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/.jenkins b/.jenkins index ce7ab284691..6f5cf80033f 100644 --- a/.jenkins +++ b/.jenkins @@ -86,6 +86,7 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_OPENMP=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ + -DKokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON \ .. && \ make -j8 && ctest --verbose''' } diff --git a/cmake/kokkos_compiler_id.cmake b/cmake/kokkos_compiler_id.cmake index fc4c06ce2b7..04589befc3a 100644 --- a/cmake/kokkos_compiler_id.cmake +++ b/cmake/kokkos_compiler_id.cmake @@ -42,10 +42,13 @@ IF(Kokkos_ENABLE_CUDA) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. 
# Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang + AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) IF(CMAKE_CXX_COMPILER_LAUNCHER) - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or -clang++!") + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + MESSAGE(STATUS "Using nvc++ as device compiler requires Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON!") + ENDIF() + MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") ENDIF() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 720c670cd65..89e23b019bd 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -47,6 +47,7 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # the past when UCX was used as MPI communication layer. We expect it is # resolved but we keep the option around a bit longer to be safe. 
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) diff --git a/cmake/kokkos_test_cxx_std.cmake b/cmake/kokkos_test_cxx_std.cmake index 5f8e15cd673..7ad49fdd2d9 100644 --- a/cmake/kokkos_test_cxx_std.cmake +++ b/cmake/kokkos_test_cxx_std.cmake @@ -120,8 +120,12 @@ IF(KOKKOS_ENABLE_CUDA) ELSEIF(CMAKE_CXX_EXTENSIONS) MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") + ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. To allow nvc++ as Cuda compiler, Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON must be set!") + ELSE() + MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") + ENDIF() ENDIF() ENDIF() From 02e6bdcce3861db3bb1d69336ebb91c27d053211 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 12 Oct 2023 07:45:44 +0200 Subject: [PATCH 054/432] ad threadvector --- .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 24 +++++++++++++++++++ core/unit_test/TestTeamVector.hpp | 4 ++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp index 05f2e4db4cf..094d2dba773 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp @@ -139,6 +139,30 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( } } +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const FunctorType& lambda, ValueType& return_val) { + using Analysis = Impl::FunctorAnalysis, + FunctorType, void>; + using analysis_value_type = typename Analysis::value_type; + static_assert(std::is_same_v, + "Non-matching value types of functor and return type"); + + ValueType scan_val = {}; + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) { + lambda(i, scan_val, true); + } + + return_val = scan_val; +} + } // namespace Kokkos #ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 6cf2208cb52..39122736ed7 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -607,7 +607,7 @@ struct functor_vec_scan { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. 
-#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if !defined(KOKKOS_ENABLE_OPENACC) template struct functor_vec_scan_ret_val { using policy_type = Kokkos::TeamPolicy; @@ -734,7 +734,7 @@ bool test_scalar(int nteams, int team_size, int test) { } else if (test == 12) { // Temporary: This condition will progressively be reduced when parallel_scan // with return value will be implemented for more backends. -#if !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if !defined(KOKKOS_ENABLE_OPENACC) Kokkos::parallel_for( Kokkos::TeamPolicy(nteams, team_size, 8), functor_vec_scan_ret_val(d_flag, team_size)); From 377b3f0576c7f09b46d1b05f04a64356c8043b2d Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 12 Oct 2023 07:48:40 +0200 Subject: [PATCH 055/432] fix order --- .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp index 094d2dba773..ad0b3c4e7f6 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp @@ -119,26 +119,6 @@ namespace Kokkos { * final==true. Scan_val will be set to the final sum value over all vector * lanes. 
*/ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const FunctorType& lambda) { - using Analysis = Impl::FunctorAnalysis, - FunctorType, void>; - using value_type = typename Analysis::value_type; - - value_type scan_val = value_type(); - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif - for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) { - lambda(i, scan_val, true); - } -} - template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::ThreadVectorRangeBoundariesStruct< @@ -163,6 +143,20 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( return_val = scan_val; } +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const FunctorType& lambda) { + using Analysis = Impl::FunctorAnalysis, + FunctorType, void>; + using value_type = typename Analysis::value_type; + + value_type scan_val = value_type(); + parallel_scan(loop_boundaries, lambda, scan_val); +} + } // namespace Kokkos #ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND From adc885184d7d94ecacdb34080be074083d70e723 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 12 Oct 2023 09:12:07 +0200 Subject: [PATCH 056/432] remove guards --- .../impl/Kokkos_MustUseKokkosSingleInTeam.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp b/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp index e45b2069154..162c72c2db7 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp @@ -36,20 +36,6 @@ struct stdalgo_must_use_kokkos_single_for_team_scan< Kokkos::Experimental::OpenACC> : std::true_type {}; #endif -// FIXME_OPENMPTARGET -#if 
defined(KOKKOS_ENABLE_OPENMPTARGET) -template <> -struct stdalgo_must_use_kokkos_single_for_team_scan< - Kokkos::Experimental::OpenMPTarget> : std::true_type {}; -#endif - -// FIXME_HPX -#if defined(KOKKOS_ENABLE_HPX) -template <> -struct stdalgo_must_use_kokkos_single_for_team_scan - : std::true_type {}; -#endif - template inline constexpr bool stdalgo_must_use_kokkos_single_for_team_scan_v = stdalgo_must_use_kokkos_single_for_team_scan::value; From 5544c0c2223795be09a4f53d232569990ffb3126 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Thu, 8 Jun 2023 16:30:57 +0000 Subject: [PATCH 057/432] UnorderedMap(space instance): proposal for #6067 --- containers/src/Kokkos_Bitset.hpp | 53 ++++++++++++-- containers/src/Kokkos_UnorderedMap.hpp | 84 ++++++++++++++++----- containers/unit_tests/TestUnorderedMap.hpp | 85 ++++++++++++++++++---- 3 files changed, 182 insertions(+), 40 deletions(-) diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index 35f691ecf4e..8be327b0492 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -28,6 +28,23 @@ namespace Kokkos { +namespace Impl { +//! Either append to the label if the property already exists, or set it. +template +auto with_updated_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { + //! If the label property is already set, append. Otherwise, set label. + if constexpr (ViewCtorProp::has_label) { + auto new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; + } else { + return Impl::with_properties_if_unset(view_ctor_prop, label); + } +} +} // namespace Impl + template class Bitset; @@ -70,13 +87,32 @@ class Bitset { block_shift = Kokkos::Impl::integral_power_of_two(block_size) }; + //! Type of @ref m_blocks. 
+ using block_view_type = View>; + public: /// constructor /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) - : m_size(arg_size), - m_last_block_mask(0u), - m_blocks("Bitset", ((m_size + block_mask) >> block_shift)) { + Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + + template + Bitset(const Impl::ViewCtorProp& arg_prop, unsigned arg_size) + : m_size(arg_size), m_last_block_mask(0u) { + //! Ensure that allocation properties are consistent. + using alloc_prop_t = std::decay_t; + static_assert(alloc_prop_t::initialize, + "Allocation property 'initialize' should be true."); + static_assert( + !alloc_prop_t::has_pointer, + "Allocation properties should not contain the 'pointer' property."); + + //! Update 'label' property and allocate. + const auto prop_copy = Kokkos::Impl::with_updated_label( + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), + " - blocks"); + m_blocks = + block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); + for (int i = 0, end = static_cast(m_size & block_mask); i < end; ++i) { m_last_block_mask |= 1u << i; } @@ -105,7 +141,7 @@ class Bitset { /// number of bits which are set to 1 /// can only be called from the host unsigned count() const { - Impl::BitsetCount > f(*this); + Impl::BitsetCount> f(*this); return f.apply(); } @@ -275,7 +311,7 @@ class Bitset { private: unsigned m_size; unsigned m_last_block_mask; - View > m_blocks; + block_view_type m_blocks; private: template @@ -302,6 +338,7 @@ class ConstBitset { public: using execution_space = typename Device::execution_space; using size_type = unsigned int; + using block_view_type = typename Bitset::block_view_type::const_type; private: enum { block_size = static_cast(sizeof(unsigned) * CHAR_BIT) }; @@ -340,7 +377,7 @@ class ConstBitset { unsigned size() const { return m_size; } unsigned count() const { - Impl::BitsetCount > f(*this); + Impl::BitsetCount> f(*this); return f.apply(); } @@ -356,7 +393,7 @@ class 
ConstBitset { private: unsigned m_size; - View > m_blocks; + block_view_type m_blocks; private: template diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 75abaf02e4d..e001c062de3 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -34,6 +34,7 @@ #include #include +#include #include @@ -302,28 +303,76 @@ class UnorderedMap { /// keys are equal. UnorderedMap(size_type capacity_hint = 0, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type()) - : m_bounded_insert(true), - m_hasher(hasher), - m_equal_to(equal_to), - m_size("UnorderedMap size"), - m_available_indexes(calculate_capacity(capacity_hint)), - m_hash_lists(view_alloc(WithoutInitializing, "UnorderedMap hash list"), - Impl::find_hash_size(capacity())), - m_next_index(view_alloc(WithoutInitializing, "UnorderedMap next index"), - capacity() + 1) // +1 so that the *_at functions can - // always return a valid reference - , - m_keys("UnorderedMap keys", capacity()), - m_values("UnorderedMap values", (is_set ? 0 : capacity())), - m_scalars("UnorderedMap scalars") { + : UnorderedMap(Kokkos::view_alloc(), capacity_hint, hasher, equal_to) {} + + template + UnorderedMap(const Impl::ViewCtorProp &arg_prop, + size_type capacity_hint = 0, hasher_type hasher = hasher_type(), + equal_to_type equal_to = equal_to_type()) + : m_bounded_insert(true), m_hasher(hasher), m_equal_to(equal_to) { if (!is_insertable_map) { Kokkos::Impl::throw_runtime_exception( "Cannot construct a non-insertable (i.e. const key_type) " "unordered_map"); } - Kokkos::deep_copy(m_hash_lists, invalid_index); - Kokkos::deep_copy(m_next_index, invalid_index); + //! Ensure that allocation properties are consistent. 
+ using alloc_prop_t = std::decay_t; + static_assert(alloc_prop_t::initialize, + "Allocation property 'initialize' should be true."); + static_assert( + !alloc_prop_t::has_pointer, + "Allocation properties should not contain the 'pointer' property."); + + /// Update allocation properties with 'label' and 'without initializing' + /// properties. + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("UnorderedMap")); + const auto prop_copy_noinit = + Impl::with_properties_if_unset(prop_copy, Kokkos::WithoutInitializing); + + //! Initialize member views. + m_size = shared_size_t(Kokkos::view_alloc( + Kokkos::DefaultHostExecutionSpace{}, + Impl::get_property(prop_copy) + " - size")); + + m_available_indexes = + bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + calculate_capacity(capacity_hint)); + + m_hash_lists = size_type_view( + Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Impl::find_hash_size(capacity())); + + m_next_index = size_type_view( + Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + capacity() + 1); // +1 so that the *_at functions can always return a + // valid reference + + m_keys = key_type_view( + Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + + m_values = value_type_view( + Kokkos::Impl::with_updated_label(prop_copy, " - values"), + is_set ? 0 : capacity()); + + m_scalars = + scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + + /** + * Deep copies should also be done using the space instance if given. + * Instead of the if/else we could use the + * @c get_property_or_default, but giving even the default execution space + * instance will change the behavior of @c deep_copy. 
+ */ + if constexpr (alloc_prop_t::has_execution_space) { + const auto &space = Impl::get_property(arg_prop); + Kokkos::deep_copy(space, m_hash_lists, invalid_index); + Kokkos::deep_copy(space, m_next_index, invalid_index); + } else { + Kokkos::deep_copy(m_hash_lists, invalid_index); + Kokkos::deep_copy(m_next_index, invalid_index); + } } void reset_failed_insert_flag() { reset_flag(failed_insert_idx); } @@ -860,7 +909,8 @@ class UnorderedMap { bool m_bounded_insert; hasher_type m_hasher; equal_to_type m_equal_to; - View m_size; + using shared_size_t = View; + shared_size_t m_size; bitset_type m_available_indexes; size_type_view m_hash_lists; size_type_view m_next_index; diff --git a/containers/unit_tests/TestUnorderedMap.hpp b/containers/unit_tests/TestUnorderedMap.hpp index 26c6bec7447..f63f1c6afe3 100644 --- a/containers/unit_tests/TestUnorderedMap.hpp +++ b/containers/unit_tests/TestUnorderedMap.hpp @@ -429,17 +429,57 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) { ASSERT_TRUE(n.is_allocated()); } +/** + * This helper is needed because NVCC does not like extended lambdas + * in private member functions. + * Google Test bodies are private member functions. So it is incompatible. + * See also https://github.com/google/googletest/issues/4104. + */ +template +struct UnorderedMapInsert { + //! Type of range-for policy and its index type. + using range_policy_t = + Kokkos::RangePolicy>; + using index_t = typename range_policy_t::index_type; + + const map_type m_map; + + //! Ensure shared ownership of @ref m_map. + UnorderedMapInsert(map_type map) : m_map(std::move(map)) {} + + //! Insert a single value. + template + void insert_single(const T &arg) const { + Kokkos::parallel_for( + Kokkos::RangePolicy(0, 1), + // NOLINTNEXTLINE(kokkos-implicit-this-capture) + KOKKOS_CLASS_LAMBDA(const index_t) { m_map.insert(arg); }); + } + + //! Insert multiple values. + template + void insert(Args &&... 
args) const { + static_assert(sizeof...(Args) > 1, "Prefer the single value version"); + constexpr size_t size = sizeof...(Args); + Kokkos::Array values{ + std::forward(args)...}; + Kokkos::parallel_for( + Kokkos::RangePolicy(0, size), + // NOLINTNEXTLINE(kokkos-implicit-this-capture) + KOKKOS_CLASS_LAMBDA(const index_t i) { m_map.insert(values[i]); }); + } +}; + TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) { - using Map = - Kokkos::UnorderedMap; + using map_type = Kokkos::UnorderedMap; + + map_type m(11); - Map m(11); ASSERT_EQ(0u, m.size()); - m.insert(2); - m.insert(3); - m.insert(5); - m.insert(7); + UnorderedMapInsert(m).insert(2, 3, 5, 7); + ASSERT_EQ(4u, m.size()); m.rehash(0); ASSERT_EQ(128u, m.capacity()); @@ -450,19 +490,22 @@ TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) { } TEST(TEST_CATEGORY, UnorderedMap_consistent_size) { - using Map = - Kokkos::UnorderedMap; + using map_type = Kokkos::UnorderedMap; + + map_type m(11); + UnorderedMapInsert inserter(m); + + inserter.insert_single(7); - Map m(11); - m.insert(7); - ; ASSERT_EQ(1u, m.size()); { - auto m2 = m; - m2.insert(2); + auto m_copy = m; + UnorderedMapInsert inserter_copy(m_copy); + inserter_copy.insert_single(2); // This line triggers modified flags to be cleared in both m and m2 - [[maybe_unused]] auto sz = m2.size(); + const auto sz = m_copy.size(); + ASSERT_EQ(2u, sz); } ASSERT_EQ(2u, m.size()); @@ -507,6 +550,18 @@ TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { } #endif +/** + * @test This test ensures that an @ref UnorderedMap can be built + * with an execution space instance (using @ref view_alloc). 
+ */ +TEST(TEST_CATEGORY, UnorderedMap_constructor_view_alloc) { + using map_type = Kokkos::UnorderedMap; + map_type map(Kokkos::view_alloc(TEST_EXECSPACE{}, "test umap"), 150); + ASSERT_EQ(map.size(), 0u); + ASSERT_GE(map.capacity(), 150u); + ASSERT_TRUE(map.is_allocated()); +} + } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP From 6ff5721a648d3b4c21ec47d504d85df090139da3 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 9 Oct 2023 15:05:57 -0400 Subject: [PATCH 058/432] Rename Kokkos_ThreadsExec to align with the other backends --- Makefile.targets | 4 +- core/src/Threads/Kokkos_Threads.hpp | 2 +- core/src/Threads/Kokkos_ThreadsTeam.hpp | 55 ++--- ...dsExec.cpp => Kokkos_Threads_Instance.cpp} | 199 +++++++++--------- ...dsExec.hpp => Kokkos_Threads_Instance.hpp} | 116 +++++----- .../Kokkos_Threads_ParallelFor_MDRange.hpp | 34 +-- .../Kokkos_Threads_ParallelFor_Range.hpp | 34 +-- .../Kokkos_Threads_ParallelFor_Team.hpp | 14 +- .../Kokkos_Threads_ParallelReduce_MDRange.hpp | 40 ++-- .../Kokkos_Threads_ParallelReduce_Range.hpp | 44 ++-- .../Kokkos_Threads_ParallelReduce_Team.hpp | 16 +- .../Kokkos_Threads_ParallelScan_Range.hpp | 38 ++-- .../Kokkos_Threads_WorkGraphPolicy.hpp | 11 +- core/src/decl/Kokkos_Declare_THREADS.hpp | 2 +- 14 files changed, 317 insertions(+), 292 deletions(-) rename core/src/Threads/{Kokkos_ThreadsExec.cpp => Kokkos_Threads_Instance.cpp} (79%) rename core/src/Threads/{Kokkos_ThreadsExec.hpp => Kokkos_Threads_Instance.hpp} (82%) diff --git a/Makefile.targets b/Makefile.targets index ec8770dd7de..0bd382f4670 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -82,8 +82,8 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp 
+Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index c0d70c03ecb..c84ef0d405c 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -40,7 +40,7 @@ static_assert(false, namespace Kokkos { namespace Impl { -class ThreadsExec; +class ThreadsInternal; enum class fence_is_static { yes, no }; } // namespace Impl } // namespace Kokkos diff --git a/core/src/Threads/Kokkos_ThreadsTeam.hpp b/core/src/Threads/Kokkos_ThreadsTeam.hpp index b1cadc7c485..958f0053410 100644 --- a/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -50,8 +50,8 @@ class ThreadsExecTeamMember { private: using space = execution_space::scratch_memory_space; - ThreadsExec* const m_exec; - ThreadsExec* const* m_team_base; ///< Base for team fan-in + ThreadsInternal* const m_instance; + ThreadsInternal* const* m_team_base; ///< Base for team fan-in space m_team_shared; size_t m_team_shared_size; int m_team_size; @@ -85,13 +85,14 @@ class ThreadsExecTeamMember { (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { Impl::spinwait_while_equal(m_team_base[j]->state(), - ThreadsExec::Active); + ThreadsInternal::Active); } // If not root then wait for release if (m_team_rank_rev) { - m_exec->state() = ThreadsExec::Rendezvous; - Impl::spinwait_while_equal(m_exec->state(), ThreadsExec::Rendezvous); + m_instance->state() = ThreadsInternal::Rendezvous; + Impl::spinwait_while_equal(m_instance->state(), + ThreadsInternal::Rendezvous); } return !m_team_rank_rev; @@ -102,7 +103,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) 
{ - m_team_base[j]->state() = ThreadsExec::Active; + m_team_base[j]->state() = ThreadsInternal::Active; } } @@ -188,10 +189,10 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return value; + if (m_instance == nullptr) return value; if (team_rank() != team_size() - 1) * - ((volatile type*)m_exec->scratch_memory()) = value; + ((volatile type*)m_instance->scratch_memory()) = value; memory_fence(); @@ -229,9 +230,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return; + if (m_instance == nullptr) return; - type* const local_value = ((type*)m_exec->scratch_memory()); + type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution if (team_rank() != team_size() - 1) { *local_value = contribution; } @@ -285,9 +286,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return type(0); + if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_exec->scratch_memory()); + volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -342,10 +343,10 @@ class ThreadsExecTeamMember { template ThreadsExecTeamMember( - Impl::ThreadsExec* exec, + Impl::ThreadsInternal* instance, const TeamPolicyInternal& team, const size_t shared_size) - : m_exec(exec), + : m_instance(instance), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(shared_size), @@ -361,9 +362,11 @@ class ThreadsExecTeamMember { if (team.league_size()) { // Execution is using device-team interface: - const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int pool_rank_rev = + m_instance->pool_size() - (m_instance->pool_rank() + 1); const int team_rank_rev = pool_rank_rev % team.team_alloc(); - const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_size = + m_instance->pool_size() / team.team_alloc(); 
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); if (pool_league_rank_rev >= pool_league_size) { m_invalid_thread = 1; @@ -372,7 +375,7 @@ class ThreadsExecTeamMember { const size_t pool_league_rank = pool_league_size - (pool_league_rank_rev + 1); - const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int pool_num_teams = m_instance->pool_size() / team.team_alloc(); const int chunk_size = team.chunk_size() > 0 ? team.chunk_size() : team.team_iter(); const int chunks_per_team = @@ -387,8 +390,8 @@ class ThreadsExecTeamMember { if ((team.team_alloc() > size_t(m_team_size)) ? (team_rank_rev >= m_team_size) - : (m_exec->pool_size() - pool_num_teams * m_team_size > - m_exec->pool_rank())) + : (m_instance->pool_size() - pool_num_teams * m_team_size > + m_instance->pool_rank())) m_invalid_thread = 1; else m_invalid_thread = 0; @@ -398,7 +401,7 @@ class ThreadsExecTeamMember { if (team_rank_rev < team.team_size() && !m_invalid_thread) { m_team_base = - m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_instance->pool_base() + team.team_alloc() * pool_league_rank_rev; m_team_size = team.team_size(); m_team_rank = team.team_size() - (team_rank_rev + 1); m_team_rank_rev = team_rank_rev; @@ -413,13 +416,13 @@ class ThreadsExecTeamMember { } if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { - m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); - m_exec->reset_steal_target(m_team_size); + m_instance->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_instance->reset_steal_target(m_team_size); } if (std::is_same::schedule_type::type, Kokkos::Dynamic>::value) { - m_exec->barrier(); + m_instance->barrier(); } } else { m_invalid_thread = 1; @@ -427,7 +430,7 @@ class ThreadsExecTeamMember { } ThreadsExecTeamMember() - : m_exec(nullptr), + : m_instance(nullptr), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(0), @@ -442,8 +445,8 @@ class ThreadsExecTeamMember { 
m_invalid_thread(0), m_team_alloc(0) {} - inline ThreadsExec& threads_exec_team_base() const { - return m_team_base ? **m_team_base : *m_exec; + inline ThreadsInternal& threads_exec_team_base() const { + return m_team_base ? **m_team_base : *m_instance; } bool valid_static() const { return m_league_rank < m_league_end; } diff --git a/core/src/Threads/Kokkos_ThreadsExec.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp similarity index 79% rename from core/src/Threads/Kokkos_ThreadsExec.cpp rename to core/src/Threads/Kokkos_Threads_Instance.cpp index c754091e87e..ece6311fcc9 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -16,6 +16,7 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include "Threads/Kokkos_Threads_Instance.hpp" #endif #include @@ -49,7 +50,7 @@ std::mutex host_internal_cppthread_mutex; // abort the process. void internal_cppthread_driver() { try { - ThreadsExec::driver(); + ThreadsInternal::driver(); } catch (const std::exception &x) { std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl; @@ -62,17 +63,18 @@ void internal_cppthread_driver() { } } -ThreadsExec s_threads_process; -ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; -std::thread::id s_threads_pid[ThreadsExec::MAX_THREAD_COUNT]; -std::pair s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; +ThreadsInternal s_threads_process; +ThreadsInternal *s_threads_exec[ThreadsInternal::MAX_THREAD_COUNT] = {nullptr}; +std::thread::id s_threads_pid[ThreadsInternal::MAX_THREAD_COUNT]; +std::pair + s_threads_coord[ThreadsInternal::MAX_THREAD_COUNT]; int s_thread_pool_size[3] = {0, 0, 0}; unsigned s_current_reduce_size = 0; unsigned s_current_shared_size = 0; -void (*volatile s_current_function)(ThreadsExec &, const void *); +void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; struct Sentinel { @@ -110,49 
+112,51 @@ namespace Impl { //---------------------------------------------------------------------------- // Spawn a thread -void ThreadsExec::spawn() { +void ThreadsInternal::spawn() { std::thread t(internal_cppthread_driver); t.detach(); } //---------------------------------------------------------------------------- -bool ThreadsExec::is_process() { +bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); return master_pid == std::this_thread::get_id(); } -void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); } +void ThreadsInternal::global_lock() { host_internal_cppthread_mutex.lock(); } -void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); } +void ThreadsInternal::global_unlock() { + host_internal_cppthread_mutex.unlock(); +} //---------------------------------------------------------------------------- -void ThreadsExec::wait_yield(volatile int &flag, const int value) { +void ThreadsInternal::wait_yield(volatile int &flag, const int value) { while (value == flag) { std::this_thread::yield(); } } -void execute_function_noop(ThreadsExec &, const void *) {} +void execute_function_noop(ThreadsInternal &, const void *) {} -void ThreadsExec::driver() { +void ThreadsInternal::driver() { SharedAllocationRecord::tracking_enable(); - ThreadsExec this_thread; + ThreadsInternal this_thread; - while (ThreadsExec::Active == this_thread.m_pool_state) { + while (this_thread.m_pool_state == ThreadsInternal::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsExec::Inactive; + this_thread.m_pool_state = ThreadsInternal::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + wait_yield(this_thread.m_pool_state, ThreadsInternal::Inactive); } } -ThreadsExec::ThreadsExec() +ThreadsInternal::ThreadsInternal() : m_pool_base(nullptr), m_scratch(nullptr), 
m_scratch_reduce_end(0), @@ -162,11 +166,11 @@ ThreadsExec::ThreadsExec() m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - m_pool_state(ThreadsExec::Terminating) { + m_pool_state(ThreadsInternal::Terminating) { if (&s_threads_process != this) { // A spawned thread - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding const int entry = reinterpret_cast(s_current_function_arg) < @@ -188,27 +192,27 @@ ThreadsExec::ThreadsExec() m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); m_pool_size = s_thread_pool_size[0]; m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsExec::Active; + m_pool_state = ThreadsInternal::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadsInternal::Active; } else { // Inform spawning process that the threads_exec entry could not be set. 
- s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadsInternal::Terminating; } } else { // Enables 'parallel_for' to execute on unitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsExec::Inactive; + m_pool_state = ThreadsInternal::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } } -ThreadsExec::~ThreadsExec() { +ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); using Record = Kokkos::Impl::SharedAllocationRecord; @@ -230,28 +234,28 @@ ThreadsExec::~ThreadsExec() { m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsExec::Terminating; + m_pool_state = ThreadsInternal::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadsInternal::Terminating; } } -int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } +int ThreadsInternal::get_thread_count() { return s_thread_pool_size[0]; } -ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { - ThreadsExec *const th = +ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { + ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] ? 
s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] : nullptr; if (nullptr == th || th->m_pool_rank != init_thread_rank) { std::ostringstream msg; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + msg << "Kokkos::Impl::ThreadsInternal::get_thread ERROR : " << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; if (nullptr == th) { msg << " does not exist"; @@ -266,9 +270,9 @@ ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { //---------------------------------------------------------------------------- -void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); +void ThreadsInternal::execute_sleep(ThreadsInternal &exec, const void *) { + ThreadsInternal::global_lock(); + ThreadsInternal::global_unlock(); const int n = exec.m_pool_fan_size; const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); @@ -276,10 +280,10 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { for (int i = 0; i < n; ++i) { Impl::spinwait_while_equal( exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsExec::Active); + ThreadsInternal::Active); } - exec.m_pool_state = ThreadsExec::Inactive; + exec.m_pool_state = ThreadsInternal::Inactive; } } // namespace Impl @@ -290,8 +294,8 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { namespace Kokkos { namespace Impl { -void ThreadsExec::verify_is_process(const std::string &name, - const bool initialized) { +void ThreadsInternal::verify_is_process(const std::string &name, + const bool initialized) { if (!is_process()) { std::string msg(name); msg.append( @@ -307,33 +311,33 @@ void ThreadsExec::verify_is_process(const std::string &name, } } -int ThreadsExec::in_parallel() { +int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master 
process. return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsExec::fence(const std::string &name) { +void ThreadsInternal::fence() { internal_fence(Impl::fence_is_static::yes); } +void ThreadsInternal::fence(const std::string &name) { internal_fence(name, Impl::fence_is_static::yes); } -void ThreadsExec::internal_fence(Impl::fence_is_static is_static) { +void ThreadsInternal::internal_fence(Impl::fence_is_static is_static) { internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsExec::fence: Unnamed Static Fence", + ? "Kokkos::ThreadsInternal::fence: Unnamed Instance Fence" + : "Kokkos::ThreadsInternal::fence: Unnamed Static Fence", is_static); } // Wait for root thread to become inactive -void ThreadsExec::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { +void ThreadsInternal::internal_fence(const std::string &name, + Impl::fence_is_static is_static) { const auto &fence_lam = [&]() { if (s_thread_pool_size[0]) { // Wait for the root thread to complete: Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadsExec::Active); + ThreadsInternal::Active); } s_current_function = nullptr; @@ -357,13 +361,13 @@ void ThreadsExec::internal_fence(const std::string &name, } /** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), - const void *arg) { - verify_is_process("ThreadsExec::start", true); +void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), + const void *arg) { + verify_is_process("ThreadsInternal::start", true); if (s_current_function || s_current_function_arg) { Kokkos::Impl::throw_runtime_exception( - std::string("ThreadsExec::start() FAILED : already executing")); + 
std::string("ThreadsInternal::start() FAILED : already executing")); } s_current_function = func; @@ -374,47 +378,47 @@ void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), // Activate threads: for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[i]->m_pool_state = ThreadsInternal::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; } } //---------------------------------------------------------------------------- -bool ThreadsExec::sleep() { - verify_is_process("ThreadsExec::sleep", true); +bool ThreadsInternal::sleep() { + verify_is_process("ThreadsInternal::sleep", true); if (&execute_sleep == s_current_function) return false; fence(); - ThreadsExec::global_lock(); + ThreadsInternal::global_lock(); s_current_function = &execute_sleep; // Activate threads: for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[--i]->m_pool_state = ThreadsInternal::Active; } return true; } -bool ThreadsExec::wake() { - verify_is_process("ThreadsExec::wake", true); +bool ThreadsInternal::wake() { + verify_is_process("ThreadsInternal::wake", true); if (&execute_sleep != s_current_function) return false; - ThreadsExec::global_unlock(); + ThreadsInternal::global_unlock(); if (s_threads_process.m_pool_base) { execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; } fence(); @@ -424,10 +428,10 @@ bool ThreadsExec::wake() { //---------------------------------------------------------------------------- -void ThreadsExec::execute_resize_scratch_in_serial() { +void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = 
s_threads_process.m_pool_base ? 1 : 0; - auto deallocate_scratch_memory = [](ThreadsExec &exec) { + auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { using Record = Kokkos::Impl::SharedAllocationRecord; @@ -449,18 +453,18 @@ void ThreadsExec::execute_resize_scratch_in_serial() { memory_fence(); for (unsigned i = s_thread_pool_size[0]; begin < i;) { - ThreadsExec &th = *s_threads_exec[--i]; + ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsExec::Active; + th.m_pool_state = ThreadsInternal::Active; - wait_yield(th.m_pool_state, ThreadsExec::Active); + wait_yield(th.m_pool_state, ThreadsInternal::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadsInternal::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; } s_current_function_arg = nullptr; @@ -472,12 +476,12 @@ void ThreadsExec::execute_resize_scratch_in_serial() { //---------------------------------------------------------------------------- -void *ThreadsExec::root_reduce_scratch() { +void *ThreadsInternal::root_reduce_scratch() { return s_threads_process.reduce_memory(); } -void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, - const void *) { +void ThreadsInternal::first_touch_allocate_thread_private_scratch( + ThreadsInternal &exec, const void *) { exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; @@ -505,7 +509,7 @@ void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, } } -void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { +void *ThreadsInternal::resize_scratch(size_t reduce_size, size_t thread_size) { 
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); @@ -522,7 +526,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || ((reduce_size == 0 && thread_size == 0) && (old_reduce_size != 0 || old_thread_size != 0))) { - verify_is_process("ThreadsExec::resize_scratch", true); + verify_is_process("ThreadsInternal::resize_scratch", true); s_threads_process.m_scratch_reduce_end = reduce_size; s_threads_process.m_scratch_thread_end = reduce_size + thread_size; @@ -537,8 +541,8 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { //---------------------------------------------------------------------------- -void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { - verify_is_process("ThreadsExec::print_configuration", false); +void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsInternal::print_configuration", false); fence(); @@ -575,7 +579,7 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { if (detail) { for (int i = 0; i < s_thread_pool_size[0]; ++i) { - ThreadsExec *const th = s_threads_exec[i]; + ThreadsInternal *const th = s_threads_exec[i]; if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); @@ -585,7 +589,7 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { - ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank << "." 
<< thfan->m_numa_core_rank << " ]"; } @@ -605,9 +609,9 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { //---------------------------------------------------------------------------- -int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } +int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } -void ThreadsExec::initialize(int thread_count_arg) { +void ThreadsInternal::initialize(int thread_count_arg) { // legacy arguments unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; unsigned use_numa_count = 0; @@ -620,7 +624,7 @@ void ThreadsExec::initialize(int thread_count_arg) { unsigned thread_spawn_failed = 0; - for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + for (int i = 0; i < ThreadsInternal::MAX_THREAD_COUNT; i++) s_threads_exec[i] = nullptr; if (!is_initialized) { @@ -659,7 +663,7 @@ void ThreadsExec::initialize(int thread_count_arg) { &execute_function_noop; // Initialization work function for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' @@ -675,18 +679,19 @@ void ThreadsExec::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. - ThreadsExec::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); - if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + ThreadsInternal::spawn(); + wait_yield(s_threads_process.m_pool_state, ThreadsInternal::Inactive); + if (s_threads_process.m_pool_state == ThreadsInternal::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. 
for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + ThreadsInternal *const th = + ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsExec::Active); + wait_yield(th->m_pool_state, ThreadsInternal::Active); } else { ++thread_spawn_failed; } @@ -694,7 +699,7 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; memory_fence(); @@ -728,7 +733,7 @@ void ThreadsExec::initialize(int thread_count_arg) { } // Initial allocations: - ThreadsExec::resize_scratch(1024, 1024); + ThreadsInternal::resize_scratch(1024, 1024); } else { s_thread_pool_size[0] = 0; s_thread_pool_size[1] = 0; @@ -773,8 +778,8 @@ void ThreadsExec::initialize(int thread_count_arg) { //---------------------------------------------------------------------------- -void ThreadsExec::finalize() { - verify_is_process("ThreadsExec::finalize", false); +void ThreadsInternal::finalize() { + verify_is_process("ThreadsInternal::finalize", false); fence(); @@ -784,18 +789,18 @@ void ThreadsExec::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + s_threads_exec[i]->m_pool_state = ThreadsInternal::Terminating; - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadsInternal::Inactive); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; } s_threads_pid[i] = std::thread::id(); } if (s_threads_process.m_pool_base) { - (&s_threads_process)->~ThreadsExec(); + 
(&s_threads_process)->~ThreadsInternal(); s_threads_exec[0] = nullptr; } @@ -814,7 +819,7 @@ void ThreadsExec::finalize() { s_threads_process.m_pool_rank = 0; s_threads_process.m_pool_size = 1; s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadsInternal::Inactive; Kokkos::Profiling::finalize(); } @@ -836,7 +841,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::internal_fence(name, Impl::fence_is_static::no); } Threads &Threads::impl_instance(int) { diff --git a/core/src/Threads/Kokkos_ThreadsExec.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp similarity index 82% rename from core/src/Threads/Kokkos_ThreadsExec.hpp rename to core/src/Threads/Kokkos_Threads_Instance.hpp index 377e096bfbe..dfbace60939 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADSEXEC_HPP -#define KOKKOS_THREADSEXEC_HPP +#ifndef KOKKOS_THREADS_INSTANCE_HPP +#define KOKKOS_THREADS_INSTANCE_HPP #include @@ -35,7 +35,7 @@ namespace Kokkos { namespace Impl { -class ThreadsExec { +class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. @@ -67,7 +67,7 @@ class ThreadsExec { // the threads that need them. // For a simple reduction the thread location is arbitrary. 
- ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + ThreadsInternal *const *m_pool_base; ///< Base for pool fan-in void *m_scratch; int m_scratch_reduce_end; @@ -95,12 +95,12 @@ class ThreadsExec { static void global_unlock(); static void spawn(); - static void first_touch_allocate_thread_private_scratch(ThreadsExec &, + static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsExec &, const void *); + static void execute_sleep(ThreadsInternal &, const void *); - ThreadsExec(const ThreadsExec &); - ThreadsExec &operator=(const ThreadsExec &); + ThreadsInternal(const ThreadsInternal &); + ThreadsInternal &operator=(const ThreadsInternal &); static void execute_resize_scratch_in_serial(); @@ -112,7 +112,7 @@ class ThreadsExec { inline long team_work_index() const { return m_team_work_index; } static int get_thread_count(); - static ThreadsExec *get_thread(const int init_thread_rank); + static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } KOKKOS_INLINE_FUNCTION void *scratch_memory() const { @@ -120,14 +120,14 @@ class ThreadsExec { } KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } - KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } static void driver(void); - ~ThreadsExec(); - ThreadsExec(); + ~ThreadsInternal(); + ThreadsInternal(); static void *resize_scratch(size_t reduce_size, size_t thread_size); @@ -167,13 +167,15 @@ class ThreadsExec { for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadsInternal::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadsInternal::Rendezvous; // 
Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + Impl::spinwait_while_equal(m_pool_state, + ThreadsInternal::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -191,7 +193,7 @@ class ThreadsExec { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadsInternal::Active; } } @@ -208,20 +210,22 @@ class ThreadsExec { for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadsInternal::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadsInternal::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + Impl::spinwait_while_equal(m_pool_state, + ThreadsInternal::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadsInternal::Active; } } } @@ -234,9 +238,10 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + Impl::spinwait_while_equal(fan.m_pool_state, + ThreadsInternal::Active); f.join( reinterpret_cast(reduce_memory()), @@ -266,7 +271,8 @@ class ThreadsExec { for (int i = 0; i < m_pool_fan_size; ++i) { Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadsInternal::Active); } } @@ -289,10 +295,11 
@@ class ThreadsExec { //-------------------------------- // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + Impl::spinwait_while_equal(fan.m_pool_state, + ThreadsInternal::Active); f.join(work_value, fan.reduce_memory()); } @@ -303,39 +310,41 @@ class ThreadsExec { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsExec::ReductionAvailable; + m_pool_state = ThreadsInternal::ReductionAvailable; // Wait for contributing threads' scan value to be available. if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { - ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + ThreadsInternal &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(th.m_pool_state, ThreadsExec::Active); Impl::spinwait_while_equal(th.m_pool_state, - ThreadsExec::ReductionAvailable); + ThreadsInternal::Active); + Impl::spinwait_while_equal(th.m_pool_state, + ThreadsInternal::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsExec::ScanAvailable; + m_pool_state = ThreadsInternal::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanAvailable); + Impl::spinwait_while_equal(m_pool_state, + ThreadsInternal::ScanAvailable); } //-------------------------------- for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank 
+ (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsExec::ReductionAvailable); + ThreadsInternal::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsExec::Rendezvous; + fan.m_pool_state = ThreadsInternal::Rendezvous; } // All threads have completed the inclusive scan. @@ -346,7 +355,7 @@ class ThreadsExec { if ((rev_rank + 1) < m_pool_size) { // Exclusive scan: copy the previous thread's inclusive scan value - ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + ThreadsInternal &th = *m_pool_base[rev_rank + 1]; // Not the root thread const scalar_type *const src_value = ((scalar_type *)th.reduce_memory()) + count; @@ -364,17 +373,18 @@ class ThreadsExec { for (int i = 0; i < m_pool_fan_size; ++i) { Impl::spinwait_while_equal( m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsExec::Rendezvous); + ThreadsInternal::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsExec::ScanCompleted; + m_pool_state = ThreadsInternal::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanCompleted); + Impl::spinwait_while_equal(m_pool_state, + ThreadsInternal::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsInternal::Active; } } @@ -392,7 +402,8 @@ class ThreadsExec { for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadsInternal::Active); } for (unsigned i = 0; i < count; ++i) { @@ -400,9 +411,10 @@ class ThreadsExec { } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadsInternal::Rendezvous; 
// Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + Impl::spinwait_while_equal(m_pool_state, + ThreadsInternal::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -424,7 +436,7 @@ class ThreadsExec { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsInternal::Active; } } @@ -433,7 +445,7 @@ class ThreadsExec { * complete and release the Threads device. * Acquire the Threads device and start this functor. */ - static void start(void (*)(ThreadsExec &, const void *), const void *); + static void start(void (*)(ThreadsInternal &, const void *), const void *); static int in_parallel(); static void fence(); @@ -583,30 +595,32 @@ class ThreadsExec { namespace Kokkos { -inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } +inline int Threads::in_parallel() { + return Impl::ThreadsInternal::in_parallel(); +} inline int Threads::impl_is_initialized() { - return Impl::ThreadsExec::is_initialized(); + return Impl::ThreadsInternal::is_initialized(); } inline void Threads::impl_initialize(InitializationSettings const &settings) { - Impl::ThreadsExec::initialize( + Impl::ThreadsInternal::initialize( settings.has_num_threads() ? 
settings.get_num_threads() : -1); } -inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } +inline void Threads::impl_finalize() { Impl::ThreadsInternal::finalize(); } inline void Threads::print_configuration(std::ostream &os, bool verbose) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_THREADS: yes\n"; os << "\nThreads Runtime Configuration:\n"; - Impl::ThreadsExec::print_configuration(os, verbose); + Impl::ThreadsInternal::print_configuration(os, verbose); } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes); + Impl::ThreadsInternal::internal_fence(name, Impl::fence_is_static::yes); } } /* namespace Kokkos */ -#endif /* #define KOKKOS_THREADSEXEC_HPP */ +#endif diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 0828f262993..59577609ab7 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -46,54 +46,54 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); self.exec_range(range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal 
&instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 3698416ef18..4a89c4fad82 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -59,37 +59,37 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + 
WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); ParallelFor::template exec_range(self.m_functor, range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = @@ -100,16 +100,16 @@ class ParallelFor, ? 
begin + self.m_policy.chunk_size() : self.m_policy.end(); ParallelFor::template exec_range(self.m_functor, begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index 36404857a22..f927d7c6a67 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -73,14 +73,14 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); ParallelFor::exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + self.m_functor, Member(&instance, self.m_policy, self.m_shared)); - exec.barrier(); - exec.fan_in(); + instance.barrier(); + instance.fan_in(); } template Policy fix_policy(Policy policy) { @@ -96,12 +96,12 @@ class ParallelFor, public: inline void execute() const { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( 0, Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelFor::exec, this); + ThreadsInternal::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 3d06379480f..fa63215a9e5 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ 
b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -54,67 +54,67 @@ class ParallelReduce(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); self.exec_range( range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); - reference_type update = - self.m_reducer.init(static_cast(exec.reduce_memory())); + reference_type update = self.m_reducer.init( + static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; self.exec_range(begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(self.m_reducer); + instance.fan_in_reduce(self.m_reducer); } public: inline void execute() const { const ReducerType &reducer = m_iter.m_func.get_reducer(); - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp index 5fa97b403c4..bf4c2a532a1 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -68,42 +68,44 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + 
reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = - reducer.init(static_cast(exec.reduce_memory())); + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index) * self.m_policy.chunk_size() + @@ -114,10 +116,10 @@ class ParallelReduce, : self.m_policy.end(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } public: @@ -130,15 +132,15 @@ class ParallelReduce, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - 
(pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index c4b6100a9df..4db310701f9 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -58,16 +58,16 @@ class ParallelReduce( self.m_functor_reducer.get_functor(), - Member(&exec, self.m_policy, self.m_shared), + Member(&instance, self.m_policy, self.m_shared), self.m_functor_reducer.get_reducer().init( - static_cast(exec.reduce_memory()))); + static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); + instance.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: @@ -80,17 +80,17 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScan &self = *((const ParallelScan *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, 
this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScan::exec, this); + ThreadsInternal::fence(); } ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -145,37 +145,37 @@ class ParallelScanWithTotal, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScanWithTotal::template exec_range( self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScanWithTotal::template exec_range( self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); - if (exec.pool_rank() == exec.pool_size() - 1) { + if (instance.pool_rank() == instance.pool_size() - 1) { *self.m_result_ptr = update; } } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScanWithTotal::exec, this); + ThreadsInternal::fence(); } template diff --git a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index d4ce697548f..c88d66db5f9 100644 --- a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ 
b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { @@ -61,16 +61,17 @@ class ParallelFor, } } - static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + static inline void thread_main(ThreadsInternal& instance, + const void* arg) noexcept { const Self& self = *(static_cast(arg)); self.exec_one_thread(); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() { - ThreadsExec::start(&Self::thread_main, this); - ThreadsExec::fence(); + ThreadsInternal::start(&Self::thread_main, this); + ThreadsInternal::fence(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/core/src/decl/Kokkos_Declare_THREADS.hpp b/core/src/decl/Kokkos_Declare_THREADS.hpp index f5cbc0c1d1d..ffb44340453 100644 --- a/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,7 @@ #if defined(KOKKOS_ENABLE_THREADS) #include -#include +#include #include #include #include From 5518eb99e2e6ed4e1b9c4975e08bea727e7dd820 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 12 Oct 2023 09:50:38 -0400 Subject: [PATCH 059/432] Promote Kokkos_Printf.hpp to public include --- core/src/Kokkos_Abort.hpp | 2 +- core/src/Kokkos_Core_fwd.hpp | 2 +- core/src/{impl => }/Kokkos_Printf.hpp | 6 +++--- core/src/SYCL/Kokkos_SYCL_Abort.hpp | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) rename core/src/{impl => }/Kokkos_Printf.hpp (93%) diff --git a/core/src/Kokkos_Abort.hpp b/core/src/Kokkos_Abort.hpp index 5639933ecb0..a8f38837ea0 100644 --- a/core/src/Kokkos_Abort.hpp +++ b/core/src/Kokkos_Abort.hpp @@ -18,7 +18,7 @@ #define KOKKOS_ABORT_HPP #include -#include +#include #ifdef KOKKOS_ENABLE_CUDA #include #endif diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index f6f4b0063e5..44f1c5b42f4 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ 
b/core/src/Kokkos_Core_fwd.hpp @@ -26,8 +26,8 @@ // and compiler environment then sets a collection of #define macros. #include +#include #include -#include #include #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 diff --git a/core/src/impl/Kokkos_Printf.hpp b/core/src/Kokkos_Printf.hpp similarity index 93% rename from core/src/impl/Kokkos_Printf.hpp rename to core/src/Kokkos_Printf.hpp index 4c1980cfcd0..39f95825c38 100644 --- a/core/src/impl/Kokkos_Printf.hpp +++ b/core/src/Kokkos_Printf.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PRINTF_HPP -#define KOKKOS_IMPL_PRINTF_HPP +#ifndef KOKKOS_PRINTF_HPP +#define KOKKOS_PRINTF_HPP #include @@ -51,4 +51,4 @@ KOKKOS_FUNCTION void printf(const char* format, Args... args) { } // namespace Kokkos -#endif /* #ifndef KOKKOS_IMPL_PRINTF_HPP */ +#endif /* #ifndef KOKKOS_PRINTF_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_Abort.hpp b/core/src/SYCL/Kokkos_SYCL_Abort.hpp index b8c2047d86b..4b0a142fe6c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Abort.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Abort.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_SYCL_ABORT_HPP #define KOKKOS_SYCL_ABORT_HPP -#include +#include #if defined(KOKKOS_ENABLE_SYCL) // FIXME_SYCL #if __has_include() From c586fa1722ab1907e0d7b2fd70a0b7ce252cbc9c Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Thu, 12 Oct 2023 09:38:34 -0600 Subject: [PATCH 060/432] simd: add floor, ceil, round, trunc operations (#6393) Added simd floor, ceil, round, trunc for all types --- simd/src/Kokkos_SIMD_AVX2.hpp | 373 +++-- simd/src/Kokkos_SIMD_AVX512.hpp | 1348 ++++++++++-------- simd/src/Kokkos_SIMD_Common.hpp | 15 +- simd/src/Kokkos_SIMD_NEON.hpp | 339 ++++- simd/src/Kokkos_SIMD_Scalar.hpp | 32 + simd/unit_tests/include/SIMDTesting_Ops.hpp | 80 ++ simd/unit_tests/include/TestSIMD_MathOps.hpp | 76 +- 7 files changed, 1495 insertions(+), 768 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 82f284d513e..521160b76fc 
100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -635,103 +635,139 @@ class simd> { } // namespace Experimental -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> copysign( - Experimental::simd> const& a, - Experimental::simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + copysign(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a, + Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& b) { __m256d const sign_mask = _mm256_set1_pd(-0.0); return Experimental::simd>( _mm256_xor_pd(_mm256_andnot_pd(sign_mask, static_cast<__m256d>(a)), _mm256_and_pd(sign_mask, static_cast<__m256d>(b)))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> abs( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { __m256d const sign_mask = _mm256_set1_pd(-0.0); return Experimental::simd>( _mm256_andnot_pd(sign_mask, static_cast<__m256d>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> sqrt( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_round_pd(static_cast<__m256d>(a), + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_round_pd(static_cast<__m256d>(a), + (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + 
return Experimental::simd>( + _mm256_round_pd(static_cast<__m256d>(a), + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_round_pd(static_cast<__m256d>(a), + (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm256_sqrt_pd(static_cast<__m256d>(a))); } #ifdef __INTEL_COMPILER -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> cbrt( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + cbrt(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm256_cbrt_pd(static_cast<__m256d>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> exp( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + exp(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm256_exp_pd(static_cast<__m256d>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> log( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + log(Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm256_log_pd(static_cast<__m256d>(a))); } #endif -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> fma( - Experimental::simd> const& a, - Experimental::simd> const& b, - Experimental::simd> const& c) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + fma(Experimental::simd> const& a, + 
Experimental::simd> const& b, + Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& c) { return Experimental::simd>( _mm256_fmadd_pd(static_cast<__m256d>(a), static_cast<__m256d>(b), static_cast<__m256d>(c))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> max( - Experimental::simd> const& a, - Experimental::simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + max(Experimental::simd> const& a, + Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& b) { return Experimental::simd>( _mm256_max_pd(static_cast<__m256d>(a), static_cast<__m256d>(b))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> min( - Experimental::simd> const& a, - Experimental::simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + min(Experimental::simd> const& a, + Experimental::simd< + double, Experimental::simd_abi::avx2_fixed_size<4>> const& b) { return Experimental::simd>( _mm256_min_pd(static_cast<__m256d>(a), static_cast<__m256d>(b))); } namespace Experimental { -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> condition( - simd_mask> const& a, - simd> const& b, - simd> const& c) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { return simd>( _mm256_blendv_pd(static_cast<__m256d>(c), static_cast<__m256d>(b), static_cast<__m256d>(a))); @@ -838,8 +874,9 @@ class simd> { } // namespace Experimental -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> copysign( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> +copysign( Experimental::simd> const& a, Experimental::simd> const& @@ -850,54 +887,90 @@ Experimental::simd> copysign( _mm_and_ps(sign_mask, static_cast<__m128>(b)))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> abs( - Experimental::simd> const& 
- a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { __m128 const sign_mask = _mm_set1_ps(-0.0); return Experimental::simd>( _mm_andnot_ps(sign_mask, static_cast<__m128>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> sqrt( - Experimental::simd> const& - a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm_round_ps(static_cast<__m128>(a), + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm_round_ps(static_cast<__m128>(a), + (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm_round_ps(static_cast<__m128>(a), + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm_round_ps(static_cast<__m128>(a), + (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm_sqrt_ps(static_cast<__m128>(a))); } #ifdef __INTEL_COMPILER -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> cbrt( - Experimental::simd> const& - a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> 
+ cbrt(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm_cbrt_ps(static_cast<__m128>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> exp( - Experimental::simd> const& - a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + exp(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm_exp_ps(static_cast<__m128>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> log( - Experimental::simd> const& - a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + log(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { return Experimental::simd>( _mm_log_ps(static_cast<__m128>(a))); } #endif -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> fma( - Experimental::simd> const& +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> +fma(Experimental::simd> const& a, Experimental::simd> const& b, @@ -908,9 +981,9 @@ Experimental::simd> fma( static_cast<__m128>(c))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> max( - Experimental::simd> const& +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> +max(Experimental::simd> const& a, Experimental::simd> const& b) { @@ -918,9 +991,9 @@ Experimental::simd> max( _mm_max_ps(static_cast<__m128>(a), static_cast<__m128>(b))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> min( - Experimental::simd> const& +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<4>> +min(Experimental::simd> const& a, Experimental::simd> const& b) { @@ -930,11 +1003,11 @@ Experimental::simd> min( namespace Experimental { -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> 
condition( - simd_mask> const& a, - simd> const& b, - simd> const& c) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { return simd>(_mm_blendv_ps( static_cast<__m128>(c), static_cast<__m128>(b), static_cast<__m128>(a))); } @@ -1064,16 +1137,48 @@ class simd> { } // namespace Experimental -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> -abs(Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { __m128i const rhs = static_cast<__m128i>(a); return Experimental::simd>( _mm_abs_epi32(rhs)); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION @@ -1238,6 +1343,38 @@ class simd> { [&](std::size_t i) { return (a[i] < 0) ? 
-a[i] : a[i]; }); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION @@ -1367,6 +1504,49 @@ simd>::simd( simd> const& other) : m_value(static_cast<__m256i>(other)) {} +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> +abs(Experimental::simd> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + 
Experimental::simd> + round(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd>( + _mm256_setr_pd(a[0], a[1], a[2], a[3])); +} + +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1386,17 +1566,6 @@ simd>::simd( } } -} // namespace Experimental - -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> -abs(Experimental::simd> const& a) { - return a; -} - -namespace Experimental { - template <> class const_where_expression>, simd>> { diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 40a3bf375b4..c5d1717ad4e 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -141,11 +141,11 @@ class simd_mask> { }; template <> -class simd> { - __m256i m_value; +class simd> { + __m512d m_value; public: - using value_type = std::int32_t; + using value_type = double; using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask; using reference = value_type&; @@ -160,12 +160,10 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_epi32(value_type(value))) {} + : m_value(_mm512_set1_pd(value_type(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m256i const& value_in) + __m512d const& value_in) : m_value(value_in) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd const& other); template > { bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( G&& gen) noexcept - : m_value( - _mm256_setr_epi32(gen(std::integral_constant()), - gen(std::integral_constant()), - 
gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()))) {} + : m_value(_mm512_setr_pd(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) { + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -191,122 +189,248 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( - value_type* ptr, element_aligned_tag) const { - _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), - m_value); - } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + m_value = _mm512_loadu_pd(ptr); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm512_storeu_pd(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const noexcept { - return simd(_mm256_sub_epi32(_mm256_set1_epi32(0), m_value)); + return simd(_mm512_sub_pd(_mm512_set1_pd(0.0), m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return simd( + _mm512_mul_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); + } + 
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm512_div_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { - return simd>( - _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + return simd( + _mm512_add_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { - return simd>( - _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + return simd( + _mm512_sub_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmplt_epi32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs), + static_cast<__m512d>(rhs), _CMP_LT_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmplt_epi32_mask(static_cast<__m256i>(rhs), - static_cast<__m256i>(lhs))); + return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(rhs), + static_cast<__m512d>(lhs), _CMP_GT_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmple_epi32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs), + static_cast<__m512d>(rhs), _CMP_LE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return 
mask_type(_mm256_cmple_epi32_mask(static_cast<__m256i>(rhs), - static_cast<__m256i>(lhs))); + return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(rhs), + static_cast<__m512d>(lhs), _CMP_GE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmpeq_epi32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs), + static_cast<__m512d>(rhs), _CMP_EQ_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmpneq_epi32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); - } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, int rhs) noexcept { - return simd(_mm256_srai_epi32(static_cast<__m256i>(lhs), rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_srav_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, int rhs) noexcept { - return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm512_cmp_pd_mask( + static_cast<__m512d>(lhs), static_cast<__m512d>(rhs), _CMP_NEQ_OS)); } }; } // namespace Experimental [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> -abs(Experimental::simd> const& a) { - __m256i const rhs = static_cast<__m256i>(a); - return Experimental::simd> +copysign( + 
Experimental::simd> const& a, + Experimental::simd> const& b) { + static const __m512i sign_mask = + reinterpret_cast<__m512i>(static_cast<__m512d>( + Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>>(-0.0))); + return Experimental::simd>( - _mm256_abs_epi32(rhs)); + reinterpret_cast<__m512d>(_mm512_xor_epi64( + _mm512_andnot_epi64( + sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(a))), + _mm512_and_epi64( + sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(b)))))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m512d const rhs = static_cast<__m512d>(a); +#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 830) + return Experimental::simd>( + (__m512d)_mm512_and_epi64((__m512i)rhs, + _mm512_set1_epi64(0x7fffffffffffffffLL))); +#else + return Experimental::simd>( + _mm512_abs_pd(rhs)); +#endif +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m512d const val = static_cast<__m512d>(a); + return Experimental::simd>( + _mm512_roundscale_pd(val, _MM_FROUND_TO_NEG_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m512d const val = static_cast<__m512d>(a); + return Experimental::simd>( + _mm512_roundscale_pd(val, _MM_FROUND_TO_POS_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m512d const val = static_cast<__m512d>(a); + return Experimental::simd>( + _mm512_roundscale_pd(val, _MM_FROUND_TO_NEAREST_INT)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + 
trunc(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m512d const val = static_cast<__m512d>(a); + return Experimental::simd>( + _mm512_roundscale_pd(val, _MM_FROUND_TO_ZERO)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_sqrt_pd(static_cast<__m512d>(a))); +} + +#ifdef __INTEL_COMPILER + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + cbrt(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cbrt_pd(static_cast<__m512d>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + exp(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_exp_pd(static_cast<__m512d>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + log(Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_log_pd(static_cast<__m512d>(a))); +} + +#endif + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +fma(Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd> const& c) { + return Experimental::simd>( + _mm512_fmadd_pd(static_cast<__m512d>(a), static_cast<__m512d>(b), + static_cast<__m512d>(c))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +max(Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( + _mm512_max_pd(static_cast<__m512d>(a), static_cast<__m512d>(b))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + 
double, Experimental::simd_abi::avx512_fixed_size<8>> +min(Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( + _mm512_min_pd(static_cast<__m512d>(a), static_cast<__m512d>(b))); } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - condition(simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), - static_cast<__m256i>(b))); + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm512_mask_blend_pd(static_cast<__mmask8>(a), static_cast<__m512d>(c), + static_cast<__m512d>(b))); } template <> -class simd> { - __m256i m_value; +class simd> { + __m256 m_value; public: - using value_type = std::uint32_t; + using value_type = float; using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask; using reference = value_type&; @@ -321,32 +445,25 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_epi32( - Kokkos::bit_cast(value_type(value)))) {} + : m_value(_mm256_set1_ps(value_type(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m256i const& value_in) + __m256 const& value_in) : m_value(value_in) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd> const& other) - : m_value(static_cast<__m256i>(other)) {} template ()); } std::is_invocable_r_v>, bool> = false> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - G&& gen) noexcept - : m_value( - _mm256_setr_epi32(gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) + : 
m_value(_mm256_setr_ps(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) { + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -354,115 +471,225 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( - value_type* ptr, element_aligned_tag) const { - _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), - m_value); - } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + m_value = _mm256_loadu_ps(ptr); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_storeu_ps(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const + noexcept { + return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value)); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return simd(_mm256_mul_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_div_ps(lhs.m_value, rhs.m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { 
- return simd( - _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + return simd(_mm256_add_ps(lhs.m_value, rhs.m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + return simd(_mm256_sub_ps(lhs.m_value, rhs.m_value)); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LT_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(rhs), - static_cast<__m256i>(lhs))); + return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GT_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(rhs), - static_cast<__m256i>(lhs))); + return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmpeq_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_EQ_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmpneq_epu32_mask(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); - } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, int rhs) noexcept { - return simd(_mm256_srli_epi32(static_cast<__m256i>(lhs), rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_srlv_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, int rhs) noexcept { - return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), - static_cast<__m256i>(rhs))); + return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_NEQ_OS)); } }; } // namespace Experimental -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> -abs(Experimental::simd> +copysign( + Experimental::simd> const& a, + Experimental::simd> const& b) { + __m256 const sign_mask = _mm256_set1_ps(-0.0); + return Experimental::simd>( + _mm256_xor_ps(_mm256_andnot_ps(sign_mask, static_cast<__m256>(a)), + _mm256_and_ps(sign_mask, static_cast<__m256>(b)))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> abs( + Experimental::simd> const& a) { - return a; + __m256 const sign_mask = _mm256_set1_ps(-0.0); + return Experimental::simd>( + _mm256_andnot_ps(sign_mask, static_cast<__m256>(a))); } -namespace Experimental { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m256 
const val = static_cast<__m256>(a); + return Experimental::simd>( + _mm256_roundscale_ps(val, _MM_FROUND_TO_NEG_INF)); +} [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - condition(simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), - static_cast<__m256i>(b))); + Experimental::simd> + ceil(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m256 const val = static_cast<__m256>(a); + return Experimental::simd>( + _mm256_roundscale_ps(val, _MM_FROUND_TO_POS_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m256 const val = static_cast<__m256>(a); + return Experimental::simd>( + _mm256_roundscale_ps(val, _MM_FROUND_TO_NEAREST_INT)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + __m256 const val = static_cast<__m256>(a); + return Experimental::simd>( + _mm256_roundscale_ps(val, _MM_FROUND_TO_ZERO)); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> sqrt( + Experimental::simd> const& a) { + return Experimental::simd>( + _mm256_sqrt_ps(static_cast<__m256>(a))); +} + +#ifdef __INTEL_COMPILER + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> cbrt( + Experimental::simd> const& a) { + return Experimental::simd>( + _mm256_cbrt_ps(static_cast<__m256>(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> exp( + Experimental::simd> const& a) { + return Experimental::simd>( + _mm256_exp_ps(static_cast<__m256>(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> log( + Experimental::simd> const& a) { + return Experimental::simd>( + _mm256_log_ps(static_cast<__m256>(a))); +} + +#endif + 
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> fma( + Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd> const& c) { + return Experimental::simd>( + _mm256_fmadd_ps(static_cast<__m256>(a), static_cast<__m256>(b), + static_cast<__m256>(c))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> max( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( + _mm256_max_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd> min( + Experimental::simd> const& a, + Experimental::simd> const& b) { + return Experimental::simd>( + _mm256_min_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +} + +namespace Experimental { + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd> condition( + simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm256_mask_blend_ps(static_cast<__mmask8>(a), static_cast<__m256>(c), + static_cast<__m256>(b))); } template <> -class simd> { - __m512i m_value; +class simd> { + __m256i m_value; public: - using value_type = std::int64_t; + using value_type = std::int32_t; using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask; using reference = value_type&; @@ -477,12 +704,12 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm512_set1_epi64(value_type(value))) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd> const& other) - : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {} + : m_value(_mm256_set1_epi32(value_type(value))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m256i const& value_in) + : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd> const& other); + simd const& other); template > { KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( G&& gen) noexcept : m_value( - 
_mm512_setr_epi64(gen(std::integral_constant()), + _mm256_setr_epi32(gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), @@ -501,8 +728,6 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in) - : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -510,122 +735,158 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm512_loadu_si512(ptr); - } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm512_storeu_si512(ptr, m_value); + _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const noexcept { - return simd(_mm512_sub_epi64(_mm512_set1_epi64(0), m_value)); + return simd(_mm256_sub_epi32(_mm256_set1_epi32(0), m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm512_mullo_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend 
simd operator+( simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + return simd>( + _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + return simd>( + _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmplt_epi64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return mask_type(_mm256_cmplt_epi32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmplt_epi64_mask(static_cast<__m512i>(rhs), - static_cast<__m512i>(lhs))); + return mask_type(_mm256_cmplt_epi32_mask(static_cast<__m256i>(rhs), + static_cast<__m256i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmple_epi64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return mask_type(_mm256_cmple_epi32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmple_epi64_mask(static_cast<__m512i>(rhs), - static_cast<__m512i>(lhs))); + return mask_type(_mm256_cmple_epi32_mask(static_cast<__m256i>(rhs), + static_cast<__m256i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { - return 
mask_type(_mm512_cmpeq_epi64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return mask_type(_mm256_cmpeq_epi32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmpneq_epi64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return mask_type(_mm256_cmpneq_epi32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, int rhs) { - return simd(_mm512_srai_epi64(static_cast<__m512i>(lhs), rhs)); + simd const& lhs, int rhs) noexcept { + return simd(_mm256_srai_epi32(static_cast<__m256i>(lhs), rhs)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, simd const& rhs) { - return simd(_mm512_srav_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_srav_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, int rhs) { - return simd(_mm512_slli_epi64(static_cast<__m512i>(lhs), rhs)); + simd const& lhs, int rhs) noexcept { + return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, simd const& rhs) { - return simd(_mm512_sllv_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } }; } // namespace Experimental [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> -abs(Experimental::simd> +abs(Experimental::simd> const& a) { - 
__m512i const rhs = static_cast<__m512i>(a); - return Experimental::simd(a); + return Experimental::simd>( - _mm512_abs_epi64(rhs)); + _mm256_abs_epi32(rhs)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +round(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepi32_pd(static_cast<__m256i>(a))); } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - condition(simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c), - static_cast<__m512i>(b))); + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), + static_cast<__m256i>(b))); } template <> -class simd> { - __m512i m_value; +class simd> { + __m256i m_value; public: - using value_type = std::uint64_t; + using value_type = std::uint32_t; 
using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask; using reference = value_type&; @@ -640,13 +901,14 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm512_set1_epi64( - Kokkos::bit_cast(value_type(value)))) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in) + : m_value(_mm256_set1_epi32( + Kokkos::bit_cast(value_type(value)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m256i const& value_in) : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd const& other) - : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {} + simd> const& other) + : m_value(static_cast<__m256i>(other)) {} template > { KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( G&& gen) noexcept : m_value( - _mm512_setr_epi64(gen(std::integral_constant()), + _mm256_setr_epi32(gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()), @@ -665,9 +927,6 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd const& other) - : m_value(static_cast<__m512i>(other)) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -675,132 +934,151 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm512_loadu_si512(ptr); - } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm512_storeu_si512(ptr, m_value); + _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm512_mullo_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { return simd( - _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { return simd( - _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, int rhs) noexcept { - return _mm512_srli_epi64(static_cast<__m512i>(lhs), rhs); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, simd const& rhs) noexcept { - return _mm512_srlv_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, int rhs) noexcept { - return _mm512_slli_epi64(static_cast<__m512i>(lhs), rhs); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, simd const& rhs) noexcept { - return _mm512_sllv_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( - simd const& 
lhs, simd const& rhs) noexcept { - return _mm512_and_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator|( - simd const& lhs, simd const& rhs) noexcept { - return _mm512_or_epi64(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs)); + _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmplt_epu64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmplt_epu64_mask(static_cast<__m512i>(rhs), - static_cast<__m512i>(lhs))); + return mask_type(_mm256_cmplt_epu32_mask(static_cast<__m256i>(rhs), + static_cast<__m256i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmple_epu64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmple_epu64_mask(static_cast<__m512i>(rhs), - static_cast<__m512i>(lhs))); + return mask_type(_mm256_cmple_epu32_mask(static_cast<__m256i>(rhs), + static_cast<__m256i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpeq_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpneq_epu32_mask(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_srli_epi32(static_cast<__m256i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_srlv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmpeq_epu64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmpneq_epu64_mask(static_cast<__m512i>(lhs), - static_cast<__m512i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); } }; } // namespace Experimental [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> -abs(Experimental::simd> +abs(Experimental::simd> const& a) { return a; } -namespace Experimental { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +floor(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return 
Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - condition(simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c), - static_cast<__m512i>(b))); +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +ceil(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd>::simd( - simd> const& other) - : m_value(_mm512_cvtepi64_epi32(static_cast<__m512i>(other))) {} +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +round(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd>::simd( - simd> const& other) - : m_value(static_cast<__m512i>(other)) {} +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +trunc(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), + static_cast<__m256i>(b))); +} template <> -class simd> { - __m512d m_value; +class simd> { + __m512i m_value; public: - using value_type = double; + using value_type = std::int64_t; using abi_type = 
simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask; using reference = value_type&; @@ -815,10 +1093,12 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm512_set1_pd(value_type(value))) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m512d const& value_in) - : m_value(value_in) {} + : m_value(_mm512_set1_epi64(value_type(value))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd> const& other) + : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd> const& other); template > { bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( G&& gen) noexcept - : m_value(_mm512_setr_pd(gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()))) { - } + : m_value( + _mm512_setr_epi64(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in) + : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -846,206 +1128,156 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - m_value = _mm512_loadu_pd(ptr); + m_value = _mm512_loadu_si512(ptr); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm512_storeu_pd(ptr, m_value); + _mm512_storeu_si512(ptr, m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
constexpr explicit operator __m512d() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const noexcept { - return simd(_mm512_sub_pd(_mm512_set1_pd(0.0), m_value)); + return simd(_mm512_sub_epi64(_mm512_set1_epi64(0), m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm512_mul_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm512_div_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); + return simd(_mm512_mullo_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { return simd( - _mm512_add_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); + _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { return simd( - _mm512_sub_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs))); + _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs), - static_cast<__m512d>(rhs), _CMP_LT_OS)); + return mask_type(_mm512_cmplt_epi64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(rhs), - static_cast<__m512d>(lhs), 
_CMP_GT_OS)); + return mask_type(_mm512_cmplt_epi64_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs), - static_cast<__m512d>(rhs), _CMP_LE_OS)); + return mask_type(_mm512_cmple_epi64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(rhs), - static_cast<__m512d>(lhs), _CMP_GE_OS)); + return mask_type(_mm512_cmple_epi64_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmp_pd_mask(static_cast<__m512d>(lhs), - static_cast<__m512d>(rhs), _CMP_EQ_OS)); + return mask_type(_mm512_cmpeq_epi64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm512_cmp_pd_mask( - static_cast<__m512d>(lhs), static_cast<__m512d>(rhs), _CMP_NEQ_OS)); + return mask_type(_mm512_cmpneq_epi64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) { + return simd(_mm512_srai_epi64(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) { + return simd(_mm512_srav_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) { + return 
simd(_mm512_slli_epi64(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) { + return simd(_mm512_sllv_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } }; } // namespace Experimental [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> -copysign( - Experimental::simd> const& a, - Experimental::simd> const& b) { - static const __m512i sign_mask = - reinterpret_cast<__m512i>(static_cast<__m512d>( - Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>>(-0.0))); - return Experimental::simd>( - reinterpret_cast<__m512d>(_mm512_xor_epi64( - _mm512_andnot_epi64( - sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(a))), - _mm512_and_epi64( - sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(b)))))); -} - -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd> - abs(Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - __m512d const rhs = static_cast<__m512d>(a); -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 830) - return Experimental::simd>( - (__m512d)_mm512_and_epi64((__m512i)rhs, - _mm512_set1_epi64(0x7fffffffffffffffLL))); -#else - return Experimental::simd>( - _mm512_abs_pd(rhs)); -#endif -} - -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd> - sqrt(Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd>( - _mm512_sqrt_pd(static_cast<__m512d>(a))); -} - -#ifdef __INTEL_COMPILER - -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd> - cbrt(Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd> +abs(Experimental::simd> const& a) { + __m512i const rhs = static_cast<__m512i>(a); + return Experimental::simd>( - 
_mm512_cbrt_pd(static_cast<__m512d>(a))); + _mm512_abs_epi64(rhs)); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd> - exp(Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +floor(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { return Experimental::simd>( - _mm512_exp_pd(static_cast<__m512d>(a))); + _mm512_cvtepi64_pd(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd> - log(Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd>( - _mm512_log_pd(static_cast<__m512d>(a))); -} - -#endif - -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> -fma(Experimental::simd> const& a, - Experimental::simd> const& b, - Experimental::simd> const& c) { + ceil(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { return Experimental::simd>( - _mm512_fmadd_pd(static_cast<__m512d>(a), static_cast<__m512d>(b), - static_cast<__m512d>(c))); + _mm512_cvtepi64_pd(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< double, Experimental::simd_abi::avx512_fixed_size<8>> -max(Experimental::simd> const& a, - Experimental::simd> const& b) { +round(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { return Experimental::simd>( - _mm512_max_pd(static_cast<__m512d>(a), static_cast<__m512d>(b))); + _mm512_cvtepi64_pd(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< double, Experimental::simd_abi::avx512_fixed_size<8>> -min(Experimental::simd> const& a, - Experimental::simd> const& b) { 
+trunc(Experimental::simd< + std::int64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { return Experimental::simd>( - _mm512_min_pd(static_cast<__m512d>(a), static_cast<__m512d>(b))); + _mm512_cvtepi64_pd(static_cast<__m512i>(a))); } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd> - condition(simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm512_mask_blend_pd(static_cast<__mmask8>(a), static_cast<__m512d>(c), - static_cast<__m512d>(b))); + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c), + static_cast<__m512i>(b))); } template <> -class simd> { - __m256 m_value; +class simd> { + __m512i m_value; public: - using value_type = float; + using value_type = std::uint64_t; using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask; using reference = value_type&; @@ -1060,25 +1292,34 @@ class simd> { template , bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_ps(value_type(value))) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m256 const& value_in) + : m_value(_mm512_set1_epi64( + Kokkos::bit_cast(value_type(value)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in) : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const& other) + : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {} template ()); } std::is_invocable_r_v>, bool> = false> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) - : m_value(_mm256_setr_ps(gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()), - gen(std::integral_constant()))) { - } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value( + _mm512_setr_epi64(gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()), + gen(std::integral_constant()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd const& other) + : m_value(static_cast<__m512i>(other)) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -1088,176 +1329,159 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - m_value = _mm256_loadu_ps(ptr); + m_value = _mm512_loadu_si512(ptr); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm256_storeu_ps(ptr, m_value); + _mm512_storeu_si512(ptr, m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { - return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value)); - } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_mul_ps(lhs.m_value, rhs.m_value)); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( - simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_div_ps(lhs.m_value, rhs.m_value)); + return simd(_mm512_mullo_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_add_ps(lhs.m_value, rhs.m_value)); + return simd( + 
_mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { - return simd(_mm256_sub_ps(lhs.m_value, rhs.m_value)); + return simd( + _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return _mm512_srli_epi64(static_cast<__m512i>(lhs), rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return _mm512_srlv_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return _mm512_slli_epi64(static_cast<__m512i>(lhs), rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return _mm512_sllv_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( + simd const& lhs, simd const& rhs) noexcept { + return _mm512_and_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator|( + simd const& lhs, simd const& rhs) noexcept { + return _mm512_or_epi64(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs)); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LT_OS)); + return mask_type(_mm512_cmplt_epu64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { - return 
mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GT_OS)); + return mask_type(_mm512_cmplt_epu64_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LE_OS)); + return mask_type(_mm512_cmple_epu64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GE_OS)); + return mask_type(_mm512_cmple_epu64_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_EQ_OS)); + return mask_type(_mm512_cmpeq_epu64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { - return mask_type(_mm256_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_NEQ_OS)); + return mask_type(_mm512_cmpneq_epu64_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); } }; } // namespace Experimental -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> -copysign( - Experimental::simd> const& a, - Experimental::simd> const& b) { - __m256 const sign_mask = _mm256_set1_ps(-0.0); - return Experimental::simd>( - _mm256_xor_ps(_mm256_andnot_ps(sign_mask, static_cast<__m256>(a)), - _mm256_and_ps(sign_mask, static_cast<__m256>(b)))); -} - -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> abs( - Experimental::simd> +abs(Experimental::simd> const& a) { - __m256 const sign_mask = _mm256_set1_ps(-0.0); - return Experimental::simd>( - 
_mm256_andnot_ps(sign_mask, static_cast<__m256>(a))); + return a; } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> sqrt( - Experimental::simd> const& a) { - return Experimental::simd> +floor(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( - _mm256_sqrt_ps(static_cast<__m256>(a))); + _mm512_cvtepu64_pd(static_cast<__m512i>(a))); } -#ifdef __INTEL_COMPILER - -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> cbrt( - Experimental::simd> const& a) { - return Experimental::simd> +ceil(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( - _mm256_cbrt_ps(static_cast<__m256>(a))); + _mm512_cvtepu64_pd(static_cast<__m512i>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> exp( - Experimental::simd> const& a) { - return Experimental::simd> +round(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( - _mm256_exp_ps(static_cast<__m256>(a))); + _mm512_cvtepu64_pd(static_cast<__m512i>(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> log( - Experimental::simd> const& a) { - return Experimental::simd> +trunc(Experimental::simd< + std::uint64_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd>( - _mm256_log_ps(static_cast<__m256>(a))); + _mm512_cvtepu64_pd(static_cast<__m512i>(a))); } -#endif - -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> fma( - Experimental::simd> const& a, - Experimental::simd> const& b, - Experimental::simd> const& c) { - return Experimental::simd>( - _mm256_fmadd_ps(static_cast<__m256>(a), static_cast<__m256>(b), - static_cast<__m256>(c))); -} +namespace Experimental { -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> max( - Experimental::simd> const& a, - Experimental::simd> const& b) { - return Experimental::simd>( - 
_mm256_max_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { + return simd>( + _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c), + static_cast<__m512i>(b))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> min( - Experimental::simd> const& a, - Experimental::simd> const& b) { - return Experimental::simd>( - _mm256_min_ps(static_cast<__m256>(a), static_cast<__m256>(b))); -} - -namespace Experimental { +simd>::simd( + simd> const& other) + : m_value(_mm512_cvtepi64_epi32(static_cast<__m512i>(other))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> condition( - simd_mask> const& a, - simd> const& b, - simd> const& c) { - return simd>( - _mm256_mask_blend_ps(static_cast<__mmask8>(a), static_cast<__m256>(c), - static_cast<__m256>(b))); -} +simd>::simd( + simd> const& other) + : m_value(static_cast<__m512i>(other)) {} template <> class const_where_expression>, diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index a54ce37a6c4..87edf994533 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -328,8 +328,21 @@ template return a == simd_mask(false); } -} // namespace Experimental +// A temporary device-callable implemenation of round half to nearest even +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round_half_to_nearest_even( + T const& x) { + auto ceil = Kokkos::ceil(x); + auto floor = Kokkos::floor(x); + + if (Kokkos::abs(ceil - x) == Kokkos::abs(floor - x)) { + auto rem = Kokkos::remainder(ceil, 2.0); + return (rem == 0) ? 
ceil : floor; + } + return Kokkos::round(x); +} +} // namespace Experimental } // namespace Kokkos #endif diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 6d82294c46d..43ece203890 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -431,20 +431,52 @@ class simd> { } // namespace Experimental -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> abs( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a) { return Experimental::simd>( vabsq_f64(static_cast(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> copysign( - Experimental::simd> const& a, - Experimental::simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndmq_f64(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndpq_f64(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndxq_f64(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndq_f64(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + copysign(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a, + Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> 
const& b) { uint64x2_t const sign_mask = vreinterpretq_u64_f64(vmovq_n_f64(-0.0)); return Experimental::simd>( vreinterpretq_f64_u64(vorrq_u64( @@ -453,43 +485,43 @@ Experimental::simd> copysign( vreinterpretq_u64_f64(static_cast(b)))))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> sqrt( - Experimental::simd> const& a) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& a) { return Experimental::simd>( vsqrtq_f64(static_cast(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> fma( - Experimental::simd> const& a, - Experimental::simd> const& b, - Experimental::simd> const& c) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + fma(Experimental::simd> const& a, + Experimental::simd> const& b, + Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& c) { return Experimental::simd>( vfmaq_f64(static_cast(c), static_cast(b), static_cast(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> max( - Experimental::simd> const& a, - Experimental::simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + max(Experimental::simd> const& a, + Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& b) { return Experimental::simd>( vmaxq_f64(static_cast(a), static_cast(b))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -Experimental::simd> min( - Experimental::simd> const& a, - Experimental::simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + min(Experimental::simd> const& a, + Experimental::simd< + double, Experimental::simd_abi::neon_fixed_size<2>> const& b) { return Experimental::simd>( vminq_f64(static_cast(a), static_cast(b))); } @@ -630,62 +662,111 @@ class simd> { } }; -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { - return simd>( +} // 
namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( vabs_f32(static_cast(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> copysign( - simd> const& a, - simd> const& b) { +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndm_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndp_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrndx_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( + vrnd_f32(static_cast(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> +copysign( + Experimental::simd> const& + a, + Experimental::simd> const& + b) { uint32x2_t const sign_mask = vreinterpret_u32_f32(vmov_n_f32(-0.0)); - return simd>(vreinterpret_f32_u32( - vorr_u32(vreinterpret_u32_f32(static_cast(abs(a))), - vand_u32(sign_mask, - vreinterpret_u32_f32(static_cast(b)))))); + return Experimental::simd>( + vreinterpret_f32_u32(vorr_u32( + vreinterpret_u32_f32(static_cast(abs(a))), + vand_u32(sign_mask, + vreinterpret_u32_f32(static_cast(b)))))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> sqrt( - simd> const& a) { - return simd>( +[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + sqrt(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( vsqrt_f32(static_cast(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> fma( - simd> const& a, - simd> const& b, - simd> const& c) { - return simd>( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> +fma(Experimental::simd> const& + a, + Experimental::simd> const& + b, + Experimental::simd> const& + c) { + return Experimental::simd>( vfma_f32(static_cast(c), static_cast(b), static_cast(a))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> max( - simd> const& a, - simd> const& b) { - return simd>( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> +max(Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( vmax_f32(static_cast(a), static_cast(b))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> min( - simd> const& a, - simd> const& b) { - return simd>( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<2>> +min(Experimental::simd> const& + a, + Experimental::simd> const& + b) { + return Experimental::simd>( vmin_f32(static_cast(a), static_cast(b))); } -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> condition( - simd_mask> const& a, - simd> const& b, - simd> const& c) { +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd> + condition(simd_mask> const& a, + simd> const& b, + simd> const& c) { return simd>( vbsl_f32(static_cast(a), static_cast(b), static_cast(c))); @@ -840,13 +921,47 @@ class simd> { } }; -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { - return simd>( +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + 
Experimental::simd> + abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( vabs_s32(static_cast(a))); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1006,13 +1121,47 @@ class simd> { } }; -KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION -simd> abs( - simd> const& a) { - return simd>( +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + abs(Experimental::simd< + std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd>( vabsq_s64(static_cast(a))); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + floor(Experimental::simd< + std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + ceil(Experimental::simd< + std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + round(Experimental::simd< + std::int64_t, 
Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd> + trunc(Experimental::simd< + std::int64_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, @@ -1173,6 +1322,38 @@ simd>::simd( return a; } +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>> +floor(Experimental::simd> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>> +ceil(Experimental::simd> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>> +round(Experimental::simd> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint64_t, Experimental::simd_abi::neon_fixed_size<2>> +trunc(Experimental::simd> const& a) { + return a; +} + +namespace Experimental { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd> condition(simd_mask> const& a, diff --git a/simd/src/Kokkos_SIMD_Scalar.hpp b/simd/src/Kokkos_SIMD_Scalar.hpp index f10547e15d0..af7cb1e2c61 100644 --- a/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/simd/src/Kokkos_SIMD_Scalar.hpp @@ -218,6 +218,38 @@ template return a; } +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto floor( + Experimental::simd const& a) { + using data_type = std::conditional_t, T, double>; + return Experimental::simd( + Kokkos::floor(static_cast(a[0]))); +}; + +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil( + Experimental::simd const& a) { + using data_type = std::conditional_t, T, double>; + return Experimental::simd( + 
Kokkos::ceil(static_cast(a[0]))); +}; + +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round( + Experimental::simd const& a) { + using data_type = std::conditional_t, T, double>; + return Experimental::simd( + Experimental::round_half_to_nearest_even(static_cast(a[0]))); +}; + +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc( + Experimental::simd const& a) { + using data_type = std::conditional_t, T, double>; + return Experimental::simd( + Kokkos::trunc(static_cast(a[0]))); +}; + template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 60dff68f309..6529f20e66a 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -105,6 +105,86 @@ class absolutes { } }; +class floors { + public: + template + auto on_host(T const& a) const { + return Kokkos::floor(a); + } + template + auto on_host_serial(T const& a) const { + return Kokkos::floor(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::floor(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + return Kokkos::floor(a); + } +}; + +class ceils { + public: + template + auto on_host(T const& a) const { + return Kokkos::ceil(a); + } + template + auto on_host_serial(T const& a) const { + return Kokkos::ceil(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::ceil(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + return Kokkos::ceil(a); + } +}; + +class rounds { + public: + template + auto on_host(T const& a) const { + return Kokkos::round(a); + } + template + auto on_host_serial(T const& a) const { + return std::rint(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::round(a); + } + template + KOKKOS_INLINE_FUNCTION auto 
on_device_serial(T const& a) const { + return Kokkos::Experimental::round_half_to_nearest_even(a); + } +}; + +class truncates { + public: + template + auto on_host(T const& a) const { + return Kokkos::trunc(a); + } + template + auto on_host_serial(T const& a) const { + return Kokkos::trunc(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::trunc(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + return Kokkos::trunc(a); + } +}; + class shift_right { public: template diff --git a/simd/unit_tests/include/TestSIMD_MathOps.hpp b/simd/unit_tests/include/TestSIMD_MathOps.hpp index 2d6557d1a7c..802e41efe5f 100644 --- a/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,12 +61,13 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + i, nlanes, arg); if (!loaded_arg) continue; - simd_type expected_result; + auto computed_result = unary_op.on_host(arg); + + decltype(computed_result) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { if (lane < nlanes) expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); } - simd_type const computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -85,12 +86,17 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], host_check_math_op_all_loaders(plus(), n, first_args, second_args); host_check_math_op_all_loaders(minus(), n, first_args, second_args); host_check_math_op_all_loaders(multiplies(), n, first_args, second_args); + host_check_math_op_all_loaders(absolutes(), n, first_args); - // TODO: Place fallback division implementations for all simd integer types - if constexpr (std::is_same_v) - host_check_math_op_all_loaders(divides(), n, first_args, second_args); + host_check_math_op_all_loaders(floors(), n, first_args); + 
host_check_math_op_all_loaders(ceils(), n, first_args); + host_check_math_op_all_loaders(rounds(), n, first_args); + host_check_math_op_all_loaders(truncates(), n, first_args); - host_check_math_op_all_loaders(absolutes(), n, first_args); + // TODO: Place fallback implementations for all simd integer types + if constexpr (std::is_floating_point_v) { + host_check_math_op_all_loaders(divides(), n, first_args, second_args); + } } template @@ -100,20 +106,28 @@ inline void host_check_abi_size() { static_assert(simd_type::size() == mask_type::size()); } -template +template inline void host_check_math_ops() { constexpr size_t n = 11; host_check_abi_size(); - if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + if constexpr (!std::is_integral_v) { + DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, + -2.0, 10.0, 0.0, 1.2, -2.8}; + DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, + -2.4, 1.0, 13.0, -3.2, -2.1}; host_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; - host_check_all_math_ops(first_args, second_args); + if constexpr (std::is_signed_v) { + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + host_check_all_math_ops(first_args, second_args); + } else { + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + host_check_all_math_ops(first_args, second_args); + } } } @@ -171,11 +185,12 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_one_loader(UnaryOp unary_op, simd_type arg; bool const loaded_arg = loader.device_load(args + i, nlanes, arg); if (!loaded_arg) continue; - simd_type 
expected_result; + auto computed_result = unary_op.on_device(arg); + + decltype(computed_result) expected_result; for (std::size_t lane = 0; lane < nlanes; ++lane) { expected_result[lane] = unary_op.on_device_serial(arg[lane]); } - simd_type const computed_result = unary_op.on_device(arg); device_check_equality(expected_result, computed_result, nlanes); } } @@ -196,12 +211,17 @@ KOKKOS_INLINE_FUNCTION void device_check_all_math_ops( device_check_math_op_all_loaders(minus(), n, first_args, second_args); device_check_math_op_all_loaders(multiplies(), n, first_args, second_args); + device_check_math_op_all_loaders(absolutes(), n, first_args); - if constexpr (std::is_same_v) + device_check_math_op_all_loaders(floors(), n, first_args); + device_check_math_op_all_loaders(ceils(), n, first_args); + device_check_math_op_all_loaders(rounds(), n, first_args); + device_check_math_op_all_loaders(truncates(), n, first_args); + + if constexpr (std::is_floating_point_v) { device_check_math_op_all_loaders(divides(), n, first_args, second_args); - - device_check_math_op_all_loaders(absolutes(), n, first_args); + } } template @@ -217,14 +237,22 @@ KOKKOS_INLINE_FUNCTION void device_check_math_ops() { device_check_abi_size(); - if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + if constexpr (!std::is_integral_v) { + DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, + -2.0, 10.0, 0.0, 1.2, -2.8}; + DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, + -2.4, 1.0, 13.0, -3.2, -2.1}; device_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; - device_check_all_math_ops(first_args, second_args); + if constexpr (std::is_signed_v) { + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 
1, -2}; + DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + device_check_all_math_ops(first_args, second_args); + } else { + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + device_check_all_math_ops(first_args, second_args); + } } } From 04a631081d9b7b10e58b7341cb476315ba9b5b37 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 12 Oct 2023 12:44:46 -0700 Subject: [PATCH 061/432] Update CI in OpenMPTarget to use llvm-17 (#6472) * OpenMPTarget: Update CI to clang-17. * OpenMPTarget: Update clang version for CI. * OpenMPTarget: update cmake version in CI. * OpenMPTarget: Disable reducers_size_t. * OpenMPTarget: Block non working unit test with clang/17. * OpenMPTarget: Update cmake version in CI. * Apply suggestions from code review Co-authored-by: Daniel Arndt --------- Co-authored-by: Rahulkumar Gayatri Co-authored-by: Christian Trott Co-authored-by: Daniel Arndt --- algorithms/unit_tests/CMakeLists.txt | 7 ++++++ core/unit_test/CMakeLists.txt | 35 +++++++++++++++++++++++++- core/unit_test/TestReducers.hpp | 3 ++- core/unit_test/TestReducers_b.hpp | 4 +++ scripts/docker/Dockerfile.openmptarget | 4 +-- 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index d866b4e250c..419f5ec1d13 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -348,6 +348,13 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) endif() endforeach() +# FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. 
+if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + list(REMOVE_ITEM ALGO_SORT_SOURCES + TestSort.cpp + ) +endif() + # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 73ef015dbfe..b71c72c3c9f 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -45,8 +45,11 @@ SET(KOKKOS_OPENMP_FEATURE_LEVEL 999) SET(KOKKOS_OPENMP_NAME OpenMP) # FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend. +# FIXME_OPENMPTARGET - Clang version 17 fails to compile incremental tests past 12. There is a PR for this in upstream already, so it should be fixed by version 18. IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10) +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.0.0) + SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 12) ELSE() SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14) ENDIF() @@ -362,13 +365,28 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL endif() endforeach() - +# Disable non-compiling tests based on clang version.
if(Kokkos_ENABLE_OPENMPTARGET) list(REMOVE_ITEM OpenMPTarget_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp + IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif() + IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + endif() # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) @@ -379,6 +397,21 @@ if(Kokkos_ENABLE_OPENMPTARGET) ) endif() +# FIXME_OPENMPTARGET - MinMaxClamp fails even with the host backend when OpenMPTarget backend is enabled. +# FIXME_OPENMPTARGET - Unsure of the reason as of now. 
+IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + IF(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_OPENMP) + list(REMOVE_ITEM OpenMP_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_MinMaxClamp.cpp + ) + ENDIF() + IF(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_SERIAL) + list(REMOVE_ITEM Serial_SOURCES1 + ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_MinMaxClamp.cpp + ) + ENDIF() +ENDIF() + if(Kokkos_ENABLE_OPENACC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index e9e86752eb7..957b9a0ca1a 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -1210,7 +1210,8 @@ struct TestReducers { // FIXME_OPENMPTARGET - The minmaxloc test fails llvm < 13 version, // test_minmaxloc_2d requires custom reductions #if defined(KOKKOS_ENABLE_OPENMPTARGET) -#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) && \ + (KOKKOS_COMPILER_CLANG <= 1700) test_minmaxloc(10007); #else if (!std::is_same_v) diff --git a/core/unit_test/TestReducers_b.hpp b/core/unit_test/TestReducers_b.hpp index a55870776e4..dedd161f6a1 100644 --- a/core/unit_test/TestReducers_b.hpp +++ b/core/unit_test/TestReducers_b.hpp @@ -16,8 +16,12 @@ #include +// FIXME_OPENMPTARGET - Fails at runtime post clang/16; skip the test only there +#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ + (KOKKOS_COMPILER_CLANG >= 1600)) namespace Test { TEST(TEST_CATEGORY, reducers_size_t) { TestReducers::execute_integer(); } } // namespace Test +#endif diff --git a/scripts/docker/Dockerfile.openmptarget b/scripts/docker/Dockerfile.openmptarget index 44c53fef1dd..708cf533b8a 100644 --- a/scripts/docker/Dockerfile.openmptarget +++ b/scripts/docker/Dockerfile.openmptarget @@ -23,7 +23,7 @@ RUN
KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ rm ${KEYDUMP_FILE}* -ARG CMAKE_VERSION=3.18.5 +ARG CMAKE_VERSION=3.27.7 ENV CMAKE_DIR=/opt/cmake RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ @@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm ${CMAKE_SCRIPT} ENV PATH=${CMAKE_DIR}/bin:$PATH -ARG LLVM_VERSION=llvmorg-15.0.0 +ARG LLVM_VERSION=llvmorg-17.0.1 ENV LLVM_DIR=/opt/llvm RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ From a0cacc3055e0629fd3243b855cdee5f42a2d2cd5 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 9 Oct 2023 15:11:04 -0400 Subject: [PATCH 062/432] Rename Kokkos_ThreadsTeam.hpp to Kokkos_Threads_Team.hpp --- .../Threads/{Kokkos_ThreadsTeam.hpp => Kokkos_Threads_Team.hpp} | 0 core/src/decl/Kokkos_Declare_THREADS.hpp | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename core/src/Threads/{Kokkos_ThreadsTeam.hpp => Kokkos_Threads_Team.hpp} (100%) diff --git a/core/src/Threads/Kokkos_ThreadsTeam.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp similarity index 100% rename from core/src/Threads/Kokkos_ThreadsTeam.hpp rename to core/src/Threads/Kokkos_Threads_Team.hpp diff --git a/core/src/decl/Kokkos_Declare_THREADS.hpp b/core/src/decl/Kokkos_Declare_THREADS.hpp index ffb44340453..4d7caec6f5f 100644 --- a/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #endif From 78c1ed8855fe46ebc050637638f21afa6d1a9c70 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 13 Oct 2023 11:59:30 -0600 Subject: [PATCH 063/432] Kokkos_SIMD_Scalar.hpp: remove extra ';' Resolve -Werror=pedantic errors in nightly gcc builds --- simd/src/Kokkos_SIMD_Scalar.hpp | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/simd/src/Kokkos_SIMD_Scalar.hpp b/simd/src/Kokkos_SIMD_Scalar.hpp index af7cb1e2c61..7443f5596b5 100644 --- a/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/simd/src/Kokkos_SIMD_Scalar.hpp @@ -224,7 +224,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Kokkos::floor(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil( @@ -232,7 +232,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Kokkos::ceil(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round( @@ -240,7 +240,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Experimental::round_half_to_nearest_even(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc( @@ -248,7 +248,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Kokkos::trunc(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION From 7b86b80a9f0830a73df66864755776836e5448bb Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Mon, 16 Oct 2023 13:30:32 -0600 Subject: [PATCH 064/432] add guards --- algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp | 5 +++++ algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index dbf637b2c91..3c1e2474bc9 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -149,6 +149,11 @@ KOKKOS_FUNCTION OutputIterator copy_if_team_impl( // no barrier needed because of the scan accumulating into count return d_first + count; } + +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) + __builtin_unreachable(); 
+#endif } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp index 2a82461e614..c7c29302786 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp @@ -174,6 +174,11 @@ KOKKOS_FUNCTION OutputIterator unique_copy_team_impl( return Impl::copy_team_impl(teamHandle, first + scan_size, last, d_first + count); } + +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) + __builtin_unreachable(); +#endif } } From b26a1f7356a3bfb0eddae11409264eea87ab8a06 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Tue, 17 Oct 2023 01:17:40 -0600 Subject: [PATCH 065/432] avoid auto --- containers/src/Kokkos_Bitset.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index 8be327b0492..cd5ca4ea512 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -33,9 +33,10 @@ namespace Impl { template auto with_updated_label(const ViewCtorProp& view_ctor_prop, const std::string& label) { + using vcp_t = ViewCtorProp; //! If the label property is already set, append. Otherwise, set label. 
- if constexpr (ViewCtorProp::has_label) { - auto new_ctor_props(view_ctor_prop); + if constexpr (vcp_t::has_label) { + vcp_t new_ctor_props(view_ctor_prop); static_cast&>(new_ctor_props) .value.append(label); return new_ctor_props; From 629135a0f05a09bb0ec31c1ff27b507f5526807e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 17 Oct 2023 16:32:12 -0400 Subject: [PATCH 066/432] [ci skip] Update Kokkos version to 4.2.99 --- CMakeLists.txt | 2 +- Makefile.kokkos | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0e98324716..f6d3ab4e29d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,7 +150,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 1) +set(Kokkos_VERSION_MINOR 2) set(Kokkos_VERSION_PATCH 99) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") diff --git a/Makefile.kokkos b/Makefile.kokkos index 489ade2d575..e5d5d865ccc 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1,7 +1,7 @@ # Default settings common options. 
KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 1 +KOKKOS_VERSION_MINOR = 2 KOKKOS_VERSION_PATCH = 99 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) From e6c51df7f4efa107222ce4ed7207936211139d3e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 17 Oct 2023 20:49:56 -0700 Subject: [PATCH 067/432] [deprecated code 3] remove all default device init tests --- core/unit_test/CMakeLists.txt | 9 - core/unit_test/TestDefaultDeviceTypeInit.hpp | 491 ------------------ .../default/TestDefaultDeviceTypeInit_1.cpp | 18 - .../default/TestDefaultDeviceTypeInit_10.cpp | 18 - .../default/TestDefaultDeviceTypeInit_11.cpp | 18 - .../default/TestDefaultDeviceTypeInit_12.cpp | 18 - .../default/TestDefaultDeviceTypeInit_13.cpp | 18 - .../default/TestDefaultDeviceTypeInit_14.cpp | 18 - .../default/TestDefaultDeviceTypeInit_15.cpp | 18 - .../default/TestDefaultDeviceTypeInit_16.cpp | 18 - .../default/TestDefaultDeviceTypeInit_17.cpp | 18 - .../default/TestDefaultDeviceTypeInit_18.cpp | 18 - .../default/TestDefaultDeviceTypeInit_2.cpp | 18 - .../default/TestDefaultDeviceTypeInit_3.cpp | 18 - .../default/TestDefaultDeviceTypeInit_4.cpp | 18 - .../default/TestDefaultDeviceTypeInit_5.cpp | 18 - .../default/TestDefaultDeviceTypeInit_6.cpp | 18 - .../default/TestDefaultDeviceTypeInit_7.cpp | 18 - .../default/TestDefaultDeviceTypeInit_8.cpp | 18 - .../default/TestDefaultDeviceTypeInit_9.cpp | 18 - 20 files changed, 824 deletions(-) delete mode 100644 core/unit_test/TestDefaultDeviceTypeInit.hpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp delete mode 100644 
core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index b71c72c3c9f..a2d40e4fc97 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -1167,15 +1167,6 @@ KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest ) endif() -if(Kokkos_ENABLE_DEPRECATED_CODE_3) - foreach(INITTESTS_NUM RANGE 1 18) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_DefaultInit_${INITTESTS_NUM} - SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp - ) - endforeach(INITTESTS_NUM) -endif() - if (KOKKOS_ENABLE_HWLOC) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HWLOC diff --git a/core/unit_test/TestDefaultDeviceTypeInit.hpp b/core/unit_test/TestDefaultDeviceTypeInit.hpp deleted file mode 100644 index 929c91db4e0..00000000000 --- a/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ /dev/null @@ -1,491 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include - -#include - -#ifdef KOKKOS_ENABLE_OPENMP -#include -#endif -#include -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - -namespace Test { - -namespace Impl { - -std::set delete_these; -void cleanup_memory() { - for (auto x : delete_these) { - delete[] x; - } -} - -char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, - bool do_other, bool do_tune, int& nargs, - Kokkos::InitArguments& init_args) { - nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + - (do_other ? 4 : 0) + (do_tune ? 1 : 0); - - char** args_kokkos = new char*[nargs]; - const int max_args_size = 45; - for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[max_args_size]; - delete_these.insert(args_kokkos[i]); - } - - int threads_idx = do_other ? 1 : 0; - int numa_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0); - int device_idx = - (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0); - int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + - (do_device ? 
1 : 0); - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - init_args.num_threads = nthreads; - snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - init_args.num_numa = numa; - snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); - } - - if (do_device) { - init_args.device_id = 0; - snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); - } - - if (do_other) { - snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, - "--dummy2arg"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, - "dummy3arg"); - snprintf(args_kokkos[device_idx + (do_device ? 
1 : 0)], max_args_size, - "dummy4arg=1"); - } - - if (do_tune) { - init_args.tune_internals = true; - snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); - } - - return args_kokkos; -} - -Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, - bool do_device, bool do_tune) { - Kokkos::InitArguments args; - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) { - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - args.num_threads = nthreads; - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - args.num_numa = numa; - } - - if (do_device) { - args.device_id = 0; - } - - if (do_tune) { - args.tune_internals = true; - } - - return args; -} - -void check_correct_initialization(const Kokkos::InitArguments& argstruct) { - ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1); - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1); - - // Figure out the number of threads the HostSpace ExecutionSpace should have - // initialized to. 
- int expected_nthreads = argstruct.num_threads; - -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - // use openmp default num threads - if (expected_nthreads < 0 || - (expected_nthreads == 0 && !Kokkos::hwloc::available())) { - expected_nthreads = omp_get_max_threads(); - } - // use hwloc if available - else if (expected_nthreads == 0 && Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - } -#endif - - if (expected_nthreads < 1) { - if (Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } else { - expected_nthreads = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - expected_nthreads = 1; - } -#endif - -#ifdef KOKKOS_ENABLE_HPX - // HPX uses all cores on machine by default. Skip this test. - if (std::is_same::value || - std::is_same::value) { - return; - } -#endif - } - - int expected_numa = argstruct.num_numa; - - if (expected_numa < 1) { - if (Kokkos::hwloc::available()) { - expected_numa = Kokkos::hwloc::get_available_numa_count(); - } else { - expected_numa = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) - expected_numa = 1; -#endif - } - - ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), - expected_nthreads); - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - int device; - cudaGetDevice(&device); - - int expected_device = argstruct.device_id; - if (argstruct.device_id < 0) { - expected_device = Kokkos::Cuda().cuda_device(); - } - - ASSERT_EQ(expected_device, device); - } -#endif - ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals()); -} - -// TODO: Add check whether correct number of threads are actually started. 
-void test_no_arguments() { - Kokkos::initialize(); - check_correct_initialization(Kokkos::InitArguments()); - Kokkos::finalize(); -} - -void test_commandline_args(int nargs, char** args, - const Kokkos::InitArguments& argstruct) { - Kokkos::initialize(nargs, args); - check_correct_initialization(argstruct); - Kokkos::finalize(); -} - -void test_initstruct_args(const Kokkos::InitArguments& args) { - Kokkos::initialize(args); - check_correct_initialization(args); - Kokkos::finalize(); -} - -} // namespace Impl - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -TEST(defaultdevicetypeinit, commandline_args_empty) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -TEST(defaultdevicetypeinit, commandline_args_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -TEST(defaultdevicetypeinit, commandline_args_nthreads) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct); - 
Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -TEST(defaultdevicetypeinit, commandline_args_nthreads_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -TEST(defaultdevicetypeinit, commandline_args_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -TEST(defaultdevicetypeinit, commandline_args_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef 
KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -TEST(defaultdevicetypeinit, initstruct_default) { - Kokkos::InitArguments args; - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -TEST(defaultdevicetypeinit, initstruct_nthreads) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -TEST(defaultdevicetypeinit, initstruct_device) { - Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -TEST(defaultdevicetypeinit, initstruct_nthreads_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true); - Impl::test_initstruct_args(args); -} -#endif - -} // namespace Test - -#endif diff --git 
a/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp deleted file mode 100644 index 348b9feeab0..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp deleted file mode 100644 index a77a55ea653..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp deleted file mode 100644 index 1b6a140920c..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp deleted file mode 100644 index 316bc85526f..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp deleted file mode 100644 index 6344960a1cf..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp deleted file mode 100644 index 4515174b82b..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp deleted file mode 100644 index 7ead50f0944..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp deleted file mode 100644 index e12b9b3894a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp deleted file mode 100644 index 959d0ab7503..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp deleted file mode 100644 index 07d841519dc..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp deleted file mode 100644 index 042a515b16a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp deleted file mode 100644 index dba401e5bcf..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp deleted file mode 100644 index a44c58bdb55..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp deleted file mode 100644 index cac0841dd83..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp deleted file mode 100644 index bafe3b3fd2a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp deleted file mode 100644 index 3a4dd9d2533..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp deleted file mode 100644 index 4e92aae565a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp deleted file mode 100644 index 44b8f3428d9..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -#include From dfd0a6d318aef7a09eedeca1b9406672fd161fac Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 17 Oct 2023 20:52:42 -0700 Subject: [PATCH 068/432] [deprecated code 3] remove InitArguments --- core/src/Kokkos_Core_fwd.hpp | 3 - .../impl/Kokkos_InitializationSettings.hpp | 61 ------------------- core/unit_test/TestInitializationSettings.cpp | 24 -------- 3 files changed, 88 deletions(-) diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 44f1c5b42f4..b8a07440f34 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -75,9 +75,6 @@ template struct Device; // forward declare here so that backend initializer calls can use it. -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments; -#endif class InitializationSettings; } // namespace Kokkos diff --git a/core/src/impl/Kokkos_InitializationSettings.hpp b/core/src/impl/Kokkos_InitializationSettings.hpp index ab4350f3a7a..d5732f284bc 100644 --- a/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/core/src/impl/Kokkos_InitializationSettings.hpp @@ -24,32 +24,6 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments { - int num_threads; - int num_numa; - int device_id; - int ndevices; - int skip_device; - bool disable_warnings; - bool tune_internals; - bool tool_help = false; - std::string tool_lib = {}; - std::string tool_args = {}; - - KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!") - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, - bool ti = false) - : num_threads{nt}, - num_numa{nn}, - device_id{dv}, - ndevices{-1}, - skip_device{9999}, - disable_warnings{dw}, - tune_internals{ti} {} -}; -#endif - class InitializationSettings { #define KOKKOS_IMPL_DECLARE(TYPE, NAME) \ private: \ @@ -80,41 +54,6 @@ class InitializationSettings { #undef 
KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - public: - InitializationSettings() = default; - - InitializationSettings(InitArguments const& old) { - if (old.num_threads != -1) { - set_num_threads(old.num_threads); - } - if (old.device_id != -1) { - set_device_id(old.device_id); - } - if (old.ndevices != -1) { - set_num_devices(old.ndevices); - } - if (old.skip_device != 9999) { - set_skip_device(old.skip_device); - } - if (old.disable_warnings) { - set_disable_warnings(true); - } - if (old.tune_internals) { - set_tune_internals(true); - } - if (old.tool_help) { - set_tools_help(true); - } - if (!old.tool_lib.empty()) { - set_tools_libs(old.tool_lib); - } - if (!old.tool_args.empty()) { - set_tools_args(old.tool_args); - } - } -#endif }; } // namespace Kokkos diff --git a/core/unit_test/TestInitializationSettings.cpp b/core/unit_test/TestInitializationSettings.cpp index f5be0e47aab..08eddc79e5d 100644 --- a/core/unit_test/TestInitializationSettings.cpp +++ b/core/unit_test/TestInitializationSettings.cpp @@ -20,30 +20,6 @@ namespace { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void take_initialization_settings(Kokkos::InitializationSettings const&) {} - -TEST(defaultdevicetype, - init_arguments_implicit_conversion_to_initialization_settings) { - Kokkos::InitArguments arguments; - take_initialization_settings(arguments); // check that conversion is implicit - arguments.device_id = 1; - arguments.tune_internals = true; - Kokkos::InitializationSettings settings{arguments}; - EXPECT_FALSE(settings.has_num_threads()); - EXPECT_TRUE(settings.has_device_id()); - EXPECT_EQ(settings.get_device_id(), 1); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); - EXPECT_FALSE(settings.has_disable_warnings()); - EXPECT_TRUE(settings.has_tune_internals()); - EXPECT_TRUE(settings.get_tune_internals()); - EXPECT_FALSE(settings.has_tools_help()); - 
EXPECT_FALSE(settings.has_tools_libs()); - EXPECT_FALSE(settings.has_tools_args()); -} -#endif - TEST(defaultdevicetype, initialization_settings) { auto const settings = Kokkos::InitializationSettings() .set_num_threads(255) From 57c0aa61f68198982ac9b08ae4f487992683f0cb Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 17 Oct 2023 21:06:52 -0700 Subject: [PATCH 069/432] [deprecated code 3] remove KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_* macros --- core/src/Kokkos_Macros.hpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 3cf7ac4fa24..d32ab2e57b6 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -433,22 +433,6 @@ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #endif -//---------------------------------------------------------------------------- -// Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) - -#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA -#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL -#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - defined(KOKKOS_ENABLE_HIP) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU -#else -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST -#endif - -#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present From 35dda2ac657d3e233aea5389ca04b142ef900036 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 09:02:33 -0400 Subject: [PATCH 070/432] [deprecated code 3] remove using declaration in Kokkos::Experimental:: for clamp, min, max, and minmax --- core/src/Kokkos_MinMaxClamp.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/core/src/Kokkos_MinMaxClamp.hpp b/core/src/Kokkos_MinMaxClamp.hpp index 
37a28a80b68..09ae9689f62 100644 --- a/core/src/Kokkos_MinMaxClamp.hpp +++ b/core/src/Kokkos_MinMaxClamp.hpp @@ -199,15 +199,6 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair minmax( return result; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Experimental { -using ::Kokkos::clamp; -using ::Kokkos::max; -using ::Kokkos::min; -using ::Kokkos::minmax; -} // namespace Experimental -#endif - } // namespace Kokkos #endif From d515a51ea317ae1efe90952ce1a5398c73044fa1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 09:03:06 -0400 Subject: [PATCH 071/432] [deprecated code 3] remove using declaration in Kokkos::Experimental:: for all math functions --- core/src/Kokkos_MathematicalFunctions.hpp | 39 ++--------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index ee64c67b93b..f1d7bdd0ba9 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -92,16 +92,6 @@ using promote_3_t = typename promote_3::type; #endif #endif -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE -#else -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - /* nothing */ -#endif - #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -128,13 +118,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } // isinf, isnan, 
and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then @@ -152,8 +136,6 @@ using promote_3_t = typename promote_3::type; T x) { \ return ::FUNC(static_cast(x)); \ } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) #else #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -174,8 +156,6 @@ using promote_3_t = typename promote_3::type; using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ @@ -221,13 +201,7 @@ using promote_3_t = typename promote_3::type; static_assert(std::is_same_v, ""); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ @@ -314,8 +288,6 @@ inline long double abs(long double x) { using std::abs; return abs(x); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { using ::Kokkos::abs; }) KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) @@ -336,12 +308,6 @@ KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); } KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); } #endif inline long double nanl(char const* arg) { return ::nanl(arg); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { - using ::Kokkos::nan; - using ::Kokkos::nanf; - using 
::Kokkos::nanl; - }) // Exponential functions KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) // FIXME_NVHPC nvc++ has issues with exp2 @@ -478,7 +444,6 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) // islessgreater // isunordered -#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE From 3172fd1b04c8fa9fa9bc037e92437bf3299f78df Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 09:03:18 -0400 Subject: [PATCH 072/432] [deprecated code 3] remove using declaration in Kokkos::Experimental:: for all math constants --- core/src/Kokkos_MathematicalConstants.hpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/core/src/Kokkos_MathematicalConstants.hpp b/core/src/Kokkos_MathematicalConstants.hpp index 51a50d347de..1a77f373fd8 100644 --- a/core/src/Kokkos_MathematicalConstants.hpp +++ b/core/src/Kokkos_MathematicalConstants.hpp @@ -51,24 +51,6 @@ KOKKOS_IMPL_MATH_CONSTANT(phi, 1.618033988749894848204586834365638118L); } // namespace Kokkos::numbers -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Kokkos::Experimental { -using Kokkos::numbers::e_v; -using Kokkos::numbers::egamma_v; -using Kokkos::numbers::inv_pi_v; -using Kokkos::numbers::inv_sqrt3_v; -using Kokkos::numbers::inv_sqrtpi_v; -using Kokkos::numbers::ln10_v; -using Kokkos::numbers::ln2_v; -using Kokkos::numbers::log10e_v; -using Kokkos::numbers::log2e_v; -using Kokkos::numbers::phi_v; -using Kokkos::numbers::pi_v; -using Kokkos::numbers::sqrt2_v; -using Kokkos::numbers::sqrt3_v; -} // namespace Kokkos::Experimental -#endif - #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS From 0505ce2940d0491c46afcc33038effe94807583f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 09:08:07 -0400 Subject: [PATCH 073/432] [deprecated code 3] remove 
{OpenMP,HPX}::partition_master --- core/src/HPX/Kokkos_HPX.hpp | 12 -- core/src/OpenMP/Kokkos_OpenMP.hpp | 12 -- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 44 -------- core/unit_test/CMakeLists.txt | 1 - .../openmp/TestOpenMP_PartitionMaster.cpp | 105 ------------------ 5 files changed, 174 deletions(-) delete mode 100644 core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 1dfc5b40646..9438a817408 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -248,18 +248,6 @@ class HPX { #endif } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - template - KOKKOS_DEPRECATED static void partition_master( - F const &, int requested_num_partitions = 0, int = 0) { - if (requested_num_partitions > 1) { - Kokkos::abort( - "Kokkos::Experimental::HPX::partition_master: can't partition an " - "HPX instance\n"); - } - } -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); #else diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 594f40d5245..4bd9a3a1cc1 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -104,18 +104,6 @@ class OpenMP { /// This always returns false on OpenMP inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - /// \brief Partition the default instance and call 'f' on each new 'master' - /// thread - /// - /// Func is a functor with the following signiture - /// void( int partition_id, int num_partitions ) - template - KOKKOS_DEPRECATED static void partition_master( - F const& f, int requested_num_partitions = 0, - int requested_partition_size = 0); -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); #else diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 03f5fff395a..301a1d533af 100644 --- 
a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -202,50 +202,6 @@ std::vector partition_space(OpenMP const& main_instance, return Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - } // namespace Kokkos #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index a2d40e4fc97..413c699be04 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -677,7 +677,6 @@ endif() if (Kokkos_ENABLE_OPENMP) set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp - openmp/TestOpenMP_PartitionMaster.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_OpenMP diff --git a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp deleted file mode 100644 index 
92b8032bf0c..00000000000 --- a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include - -namespace Test { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -TEST(openmp, partition_master) { - using Mutex = Kokkos::Experimental::MasterLock; - - Mutex mtx; - int errors = 0; - - auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); - - { - std::unique_lock lock(mtx); - if (Kokkos::OpenMP::in_parallel()) { - ++errors; - } - if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { - ++errors; - } - } - - { - int local_errors = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 1000), - [pool_size](const int, int& errs) { - if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { - ++errs; - } - }, - local_errors); - Kokkos::atomic_add(&errors, local_errors); - } - - Kokkos::Experimental::UniqueToken token; - - Kokkos::View count("", token.size()); - - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1000), - [=](const int) { - int i = token.acquire(); - ++count[i]; - token.release(i); - }); - - Kokkos::View sum(""); - Kokkos::parallel_for( - Kokkos::RangePolicy(0, token.size()), - [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); - - if (sum() != 1000) { - Kokkos::atomic_add(&errors, 1); - } - }; - - master(0, 1); - - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master); 
- ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 4, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 4); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 2, 2); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 8); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 8); - ASSERT_EQ(errors, 0); -} -#endif - -} // namespace Test From ca49c65f6bde19e720090dddca47f7d41907c3b8 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 09:47:00 -0400 Subject: [PATCH 074/432] OpenMP backend cleanup following removal of deprecated code 3 --- core/src/OpenMP/Kokkos_OpenMP.cpp | 15 ----- core/src/OpenMP/Kokkos_OpenMP.hpp | 10 ---- core/src/OpenMP/Kokkos_OpenMP_Instance.cpp | 55 ------------------- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 5 -- .../src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp | 25 +-------- .../OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp | 24 -------- .../OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp | 16 ------ 7 files changed, 1 insertion(+), 149 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 9a169a435c7..245e1bfb3af 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -82,28 +82,13 @@ bool OpenMP::impl_is_initialized() noexcept { } bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif } int OpenMP::impl_thread_pool_size() const noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(*this) - ? omp_get_num_threads() - : (Impl::t_openmp_instance - ? 
Impl::t_openmp_instance->m_pool_size - : impl_internal_space_instance()->m_pool_size); -#else return OpenMP::in_parallel(*this) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; -#endif } int OpenMP::impl_max_hardware_threads() noexcept { diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 4bd9a3a1cc1..9d20a611139 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -53,11 +53,6 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif } // namespace Impl /// \class OpenMP @@ -156,12 +151,7 @@ class OpenMP { inline int OpenMP::impl_thread_pool_rank() noexcept { // FIXME_OPENMP Can we remove this when removing partition_master? It's only // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 
0 : omp_get_thread_num();)) -#else KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif KOKKOS_IF_ON_DEVICE((return -1;)) } diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 44f0fbc180a..3038345b269 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -47,61 +47,6 @@ void OpenMPInternal::release_lock() { desul::MemoryScopeDevice()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void OpenMPInternal::validate_partition_impl(const int nthreads, - int &num_partitions, - int &partition_size) { - if (nthreads == 1) { - num_partitions = 1; - partition_size = 1; - } else if (num_partitions < 1 && partition_size < 1) { - int idle = nthreads; - for (int np = 2; np <= nthreads; ++np) { - for (int ps = 1; ps <= nthreads / np; ++ps) { - if (nthreads - np * ps < idle) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } else if (num_partitions < 1 && partition_size > 0) { - if (partition_size <= nthreads) { - num_partitions = nthreads / partition_size; - } else { - num_partitions = 1; - partition_size = nthreads; - } - } else if (num_partitions > 0 && partition_size < 1) { - if (num_partitions <= nthreads) { - partition_size = nthreads / num_partitions; - } else { - num_partitions = nthreads; - partition_size = 1; - } - } else if (num_partitions * partition_size > nthreads) { - int idle = nthreads; - const int NP = num_partitions; - const int PS = partition_size; - for (int np = NP; np > 0; --np) { - for (int ps = PS; ps > 0; --ps) { - if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } -} -#endif - void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp 
b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 301a1d533af..c95a793ecc7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -99,11 +99,6 @@ class OpenMPInternal { // Release lock used to protect access to m_pool void release_lock(); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - static void validate_partition_impl(const int nthreads, int& num_partitions, - int& partition_size); -#endif - void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 96dc664eb79..823a7e668e5 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -147,15 +147,7 @@ class ParallelFor, Kokkos::OpenMP> { inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -251,16 +243,9 @@ class ParallelFor, inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) : m_instance(nullptr), m_iter(arg_policy, arg_functor) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } + template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -409,15 +394,7 @@ class ParallelFor, m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize::value( arg_functor, arg_policy.team_size())) { -#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 52cdef18e65..05fd1c9dce3 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -170,15 +170,7 @@ class ParallelReduce, m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, @@ -319,15 +311,7 @@ class ParallelReduce::accessible, @@ -543,15 +527,7 @@ class ParallelReduce::value( arg_functor_reducer.get_functor(), arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess, inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -292,15 +284,7 @@ class ParallelScanWithTotal, Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::OpenMP parallel_scan 
result must be host-accessible!"); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } //---------------------------------------- From fb0bd529750a11d2f92ad12d14c9d30c2fd26277 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 10:21:07 -0400 Subject: [PATCH 075/432] Get rid of FIXME_OPENMP OpenMP::impl_thread_pool_rank() is used in UniqueToken::acquire() --- core/src/OpenMP/Kokkos_OpenMP.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 9d20a611139..9ee2291c029 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -149,8 +149,6 @@ class OpenMP { }; inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? 
It's only - // used in one partition_master test KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) KOKKOS_IF_ON_DEVICE((return -1;)) From 7c63c32bd367ea4ac1f1b39ea463e67221ceca54 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 10:22:53 -0400 Subject: [PATCH 076/432] [deprecated code 3] remove MasterLock --- core/src/Kokkos_Core_fwd.hpp | 4 -- core/src/Kokkos_MasterLock.hpp | 56 ---------------------- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 25 ---------- 3 files changed, 85 deletions(-) delete mode 100644 core/src/Kokkos_MasterLock.hpp diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index b8a07440f34..0c64352464a 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -30,10 +30,6 @@ #include #include -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -#include -#endif - //---------------------------------------------------------------------------- // Have assumed a 64-bit build (8-byte pointers) throughout the code base. // 32-bit build allowed but unsupported. diff --git a/core/src/Kokkos_MasterLock.hpp b/core/src/Kokkos_MasterLock.hpp deleted file mode 100644 index 1d09617371a..00000000000 --- a/core/src/Kokkos_MasterLock.hpp +++ /dev/null @@ -1,56 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MASTER_LOCK_HPP -#define KOKKOS_MASTER_LOCK_HPP - -#include - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - -namespace Kokkos { -namespace Experimental { - -// my be used to coordinate work between master instances -// SHOULD NOT be used within a parallel algorithm -// -// This lock should be used with with a scoped lock guard -// i.e. std::unique_lock, std::lock_guard -// -// cannot be copied or moved -// has the following functions available -// -// Lock() -// ~Lock() -// -// void lock() -// void unlock() -// bool try_lock() -// -template -class MasterLock; - -} // namespace Experimental -} // namespace Kokkos - -#endif - -#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index c95a793ecc7..583beee3252 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -119,31 +119,6 @@ class OpenMPInternal { } // namespace Impl -namespace Experimental { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <> -class MasterLock { - public: - void lock() { omp_set_lock(&m_lock); } - void unlock() { omp_unset_lock(&m_lock); } - bool try_lock() { return static_cast(omp_test_lock(&m_lock)); } - - KOKKOS_DEPRECATED MasterLock() { omp_init_lock(&m_lock); } - ~MasterLock() { omp_destroy_lock(&m_lock); } - - MasterLock(MasterLock const&) = delete; - MasterLock(MasterLock&&) = delete; - MasterLock& operator=(MasterLock const&) = delete; - MasterLock& operator=(MasterLock&&) = delete; - - private: - omp_lock_t m_lock; -}; -#endif - -} // namespace Experimental - namespace Experimental { namespace Impl { // Partitioning an Execution Space: expects space and integer arguments for From 2e6765a2aa5c100a8553f2529019cd06f0da2921 Mon 
Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 10:24:00 -0400 Subject: [PATCH 077/432] [deprecated code 3] remove ENABLE_DEPRECATED_CODE_3 option --- .jenkins | 2 -- cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_enable_options.cmake | 1 - 3 files changed, 4 deletions(-) diff --git a/.jenkins b/.jenkins index 6f5cf80033f..f5d0bfcf0e8 100644 --- a/.jenkins +++ b/.jenkins @@ -390,7 +390,6 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ @@ -493,7 +492,6 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index bec59ebd034..8ef464be33c 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -45,7 +45,6 @@ #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 89e23b019bd..a36742e4dfb 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -48,7 +48,6 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. 
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") From 589ad55b0cac014beeb23e5479f931ede0a2b9ff Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 18 Oct 2023 10:41:35 -0400 Subject: [PATCH 078/432] fixup! [deprecated code 3] remove using declaration in Kokkos::Experimental:: for all math functions --- core/src/Kokkos_MathematicalFunctions.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index f1d7bdd0ba9..361d1317e94 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -135,7 +135,7 @@ using promote_3_t = typename promote_3::type; KOKKOS_INLINE_FUNCTION std::enable_if_t, bool> FUNC( \ T x) { \ return ::FUNC(static_cast(x)); \ - } \ + } #else #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -155,7 +155,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ + } #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ From 380754b91e2f8735c5fdfc736a1ce1b483c5ac38 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 17 Oct 2023 19:54:23 -0700 Subject: [PATCH 079/432] Do not append " - blocks" to the bitset label --- containers/src/Kokkos_Bitset.hpp | 23 ++----------------- 
.../src/impl/Kokkos_UnorderedMap_impl.hpp | 16 +++++++++++++ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index cd5ca4ea512..9ccb52cf119 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -28,24 +28,6 @@ namespace Kokkos { -namespace Impl { -//! Either append to the label if the property already exists, or set it. -template -auto with_updated_label(const ViewCtorProp& view_ctor_prop, - const std::string& label) { - using vcp_t = ViewCtorProp; - //! If the label property is already set, append. Otherwise, set label. - if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } -} -} // namespace Impl - template class Bitset; @@ -108,9 +90,8 @@ class Bitset { "Allocation properties should not contain the 'pointer' property."); //! Update 'label' property and allocate. - const auto prop_copy = Kokkos::Impl::with_updated_label( - Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), - " - blocks"); + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")); m_blocks = block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); diff --git a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 8f8cd9523b7..857d3271b10 100644 --- a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,6 +27,22 @@ namespace Kokkos { namespace Impl { +//! Either append to the label if the property already exists, or set it. +template +auto with_updated_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { + using vcp_t = ViewCtorProp; + //! If the label property is already set, append. Otherwise, set label. 
+ if constexpr (vcp_t::has_label) { + vcp_t new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; + } else { + return Impl::with_properties_if_unset(view_ctor_prop, label); + } +} + uint32_t find_hash_size(uint32_t size); template From ef889a7ab04dfd2bc61ae018b6547c56c7c4c93a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 18 Oct 2023 12:02:24 -0400 Subject: [PATCH 080/432] with_updated_label -> append_to_label --- containers/src/Kokkos_UnorderedMap.hpp | 18 ++++++++--------- .../src/impl/Kokkos_UnorderedMap_impl.hpp | 20 ++++++++----------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index e001c062de3..4b3e9ce9386 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -337,27 +337,27 @@ class UnorderedMap { Impl::get_property(prop_copy) + " - size")); m_available_indexes = - bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + bitset_type(Kokkos::Impl::append_to_label(prop_copy, " - bitset"), calculate_capacity(capacity_hint)); m_hash_lists = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - hash list"), Impl::find_hash_size(capacity())); m_next_index = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - next index"), capacity() + 1); // +1 so that the *_at functions can always return a // valid reference - m_keys = key_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + m_keys = key_type_view(Kokkos::Impl::append_to_label(prop_copy, " - keys"), + capacity()); - m_values = value_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - values"), - is_set ? 
0 : capacity()); + m_values = + value_type_view(Kokkos::Impl::append_to_label(prop_copy, " - values"), + is_set ? 0 : capacity()); m_scalars = - scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + scalars_view(Kokkos::Impl::append_to_label(prop_copy, " - scalars")); /** * Deep copies should also be done using the space instance if given. diff --git a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 857d3271b10..a979ee40d8c 100644 --- a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,20 +27,16 @@ namespace Kokkos { namespace Impl { -//! Either append to the label if the property already exists, or set it. +//! Append to the label contained in view_ctor_prop. template -auto with_updated_label(const ViewCtorProp& view_ctor_prop, - const std::string& label) { +auto append_to_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { using vcp_t = ViewCtorProp; - //! If the label property is already set, append. Otherwise, set label. 
- if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } + static_assert(vcp_t::has_label); + vcp_t new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; } uint32_t find_hash_size(uint32_t size); From 2bc1721d7c7023a3386a36cdd06380dbc2f1846e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 18 Oct 2023 14:21:03 +0000 Subject: [PATCH 081/432] SYCL: Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL to detect support for device global variables --- cmake/kokkos_arch.cmake | 47 +++++++++++-------- .../include/desul/atomics/Adapt_SYCL.hpp | 7 ++- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index bccf674d763..924a2b8bcbf 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -588,32 +588,39 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is -# available, use that instead. 
+# FIXME_SYCL Even if SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we still can't +# use device global variables with shared libraries IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - CHECK_CXX_SOURCE_COMPILES(" - #include - using namespace sycl::ext::oneapi::experimental; - using namespace sycl; + INCLUDE(CheckCXXSymbolExists) + CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + ELSE() + INCLUDE(CheckCXXSourceCompiles) + CHECK_CXX_SOURCE_COMPILES(" + #include + using namespace sycl::ext::oneapi::experimental; + using namespace sycl; - SYCL_EXTERNAL device_global Foo; + SYCL_EXTERNAL device_global Foo; - void bar(queue q) { - q.single_task([=] { - Foo = 42; - }); - } + void bar(queue q) { + q.single_task([=] { + Foo = 42; + }); + } - int main(){ return 0; } - " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + int main(){ return 0; } + " + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED - ) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) + ENDIF() ENDIF() ENDIF() diff --git a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index 082fc132de5..15c6d78d94b 100644 --- a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -88,15 +88,18 @@ using sycl_atomic_ref = sycl::atomic_ref; #endif -// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead #ifdef 
DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL +template +using sycl_device_global = sycl::ext::oneapi::experimental::device_global; +#else template using sycl_device_global = sycl::ext::oneapi::experimental::device_global< T, decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::oneapi::experimental::device_image_scope))>; #endif +#endif } // namespace Impl } // namespace desul From a30b9aa78d2274e6eaec5056a8cdf7ae0ef2465d Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Mon, 16 Oct 2023 19:54:47 +0900 Subject: [PATCH 082/432] Fixup in README (github -> GitHub) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 033346e956e..a5b9811cec9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ To start learning about Kokkos: - [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. 
For non-public questions send an email to: *crtrott(at)sandia.gov* From e156d5859e36fbc3c4f84575a0c8734689c1b3eb Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 24 Oct 2023 13:58:13 +0000 Subject: [PATCH 083/432] Check that device associated with stream matches requested device --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 15632ab5154..fb5a97b4ae9 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -372,6 +372,22 @@ void CudaInternal::fence() const { void CudaInternal::initialize(int cuda_device, cudaStream_t stream, bool manage_stream) { + // Check that the device associated with the stream matches cuda_device + CUcontext context; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); + int device_for_stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&device_for_stream))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPopCurrent(&context))); + + if (device_for_stream != cuda_device) { + std::stringstream ss; + ss << "Error: The provided stream is associated with device " + << device_for_stream << " but device " << cuda_device + << " was requested in the execution space instance constructor!"; + Kokkos::abort(ss.str().c_str()); + } + KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) From cf5a859bf07f955d8b452990b0faa700125b36d1 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Tue, 24 Oct 2023 17:52:23 -0400 Subject: [PATCH 084/432] Threads: replace enum with constexpr int and enum class (#6514) * Add support for enum in spinwait * Replace enum by enum class and static constexpr in ThreadsInternal * Let the compiler deduce the template parameter * Move the ThreadsInternal::State enum to its own file * Move Kokkos_Spinwait to the Threads backend * Remove 
template parameters of functions in Kokkos_Threads_Spinwait * Remove unused functions * Fix indentation * Remove useless include --- Makefile.targets | 4 +- .../Kokkos_OpenMPTarget_Parallel.hpp | 1 - .../Kokkos_OpenMPTarget_Reducer.hpp | 1 - core/src/Threads/Kokkos_Threads_Instance.cpp | 67 +++++------ core/src/Threads/Kokkos_Threads_Instance.hpp | 109 +++++++----------- .../Kokkos_Threads_Spinwait.cpp} | 12 +- core/src/Threads/Kokkos_Threads_Spinwait.hpp | 43 +++++++ core/src/Threads/Kokkos_Threads_State.hpp | 39 +++++++ core/src/Threads/Kokkos_Threads_Team.hpp | 13 +-- core/src/impl/Kokkos_HostThreadTeam.cpp | 1 - core/src/impl/Kokkos_Spinwait.hpp | 109 ------------------ 11 files changed, 174 insertions(+), 225 deletions(-) rename core/src/{impl/Kokkos_Spinwait.cpp => Threads/Kokkos_Threads_Spinwait.cpp} (90%) create mode 100644 core/src/Threads/Kokkos_Threads_Spinwait.hpp create mode 100644 core/src/Threads/Kokkos_Threads_State.hpp delete mode 100644 core/src/impl/Kokkos_Spinwait.hpp diff --git a/Makefile.targets b/Makefile.targets index 0bd382f4670..75155bdd25f 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: 
$(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -84,6 +82,8 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 9767d8e53ef..a84de76aad0 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 672271ed6b9..9b578aca112 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -18,7 +18,6 @@ #define KOKKOS_OPENMPTARGETREDUCER_HPP #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index ece6311fcc9..1c58a7c2732 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -133,7 +133,8 @@ void ThreadsInternal::global_unlock() { //---------------------------------------------------------------------------- -void ThreadsInternal::wait_yield(volatile int &flag, const int value) { +void ThreadsInternal::wait_yield(volatile ThreadState &flag, + const ThreadState value) { while (value ==
flag) { std::this_thread::yield(); } @@ -146,13 +147,13 @@ void ThreadsInternal::driver() { ThreadsInternal this_thread; - while (this_thread.m_pool_state == ThreadsInternal::Active) { + while (this_thread.m_pool_state == ThreadState::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsInternal::Inactive; + this_thread.m_pool_state = ThreadState::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsInternal::Inactive); + wait_yield(this_thread.m_pool_state, ThreadState::Inactive); } } @@ -166,7 +167,7 @@ ThreadsInternal::ThreadsInternal() m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - m_pool_state(ThreadsInternal::Terminating) { + m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { // A spawned thread @@ -192,21 +193,21 @@ ThreadsInternal::ThreadsInternal() m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); m_pool_size = s_thread_pool_size[0]; m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsInternal::Active; + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_pool_state = ThreadsInternal::Active; + s_threads_process.m_pool_state = ThreadState::Active; } else { // Inform spawning process that the threads_exec entry could not be set. 
- s_threads_process.m_pool_state = ThreadsInternal::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } else { // Enables 'parallel_for' to execute on unitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsInternal::Inactive; + m_pool_state = ThreadState::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } @@ -234,14 +235,14 @@ ThreadsInternal::~ThreadsInternal() { m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsInternal::Terminating; + m_pool_state = ThreadState::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsInternal::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } @@ -278,12 +279,12 @@ void ThreadsInternal::execute_sleep(ThreadsInternal &exec, const void *) { const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal( + Impl::spinwait_while_equal( exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsInternal::Active); + ThreadState::Active); } - exec.m_pool_state = ThreadsInternal::Inactive; + exec.m_pool_state = ThreadState::Inactive; } } // namespace Impl @@ -336,8 +337,8 @@ void ThreadsInternal::internal_fence(const std::string &name, const auto &fence_lam = [&]() { if (s_thread_pool_size[0]) { // Wait for the root thread to complete: - Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadsInternal::Active); + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); } s_current_function = nullptr; @@ -378,13 +379,13 @@ void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), // Activate threads: for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsInternal::Active; + s_threads_exec[i]->m_pool_state = 
ThreadState::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } } @@ -403,7 +404,7 @@ bool ThreadsInternal::sleep() { // Activate threads: for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsInternal::Active; + s_threads_exec[--i]->m_pool_state = ThreadState::Active; } return true; @@ -418,7 +419,7 @@ bool ThreadsInternal::wake() { if (s_threads_process.m_pool_base) { execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } fence(); @@ -455,16 +456,16 @@ void ThreadsInternal::execute_resize_scratch_in_serial() { for (unsigned i = s_thread_pool_size[0]; begin < i;) { ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsInternal::Active; + th.m_pool_state = ThreadState::Active; - wait_yield(th.m_pool_state, ThreadsInternal::Active); + wait_yield(th.m_pool_state, ThreadState::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsInternal::Active; + s_threads_process.m_pool_state = ThreadState::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_current_function_arg = nullptr; @@ -663,7 +664,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { &execute_function_noop; // Initialization work function for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will // choose its own entry in 
's_threads_coord' @@ -680,8 +681,8 @@ void ThreadsInternal::initialize(int thread_count_arg) { // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. ThreadsInternal::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsInternal::Inactive); - if (s_threads_process.m_pool_state == ThreadsInternal::Terminating) break; + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); + if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. @@ -691,7 +692,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { ThreadsInternal *const th = ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsInternal::Active); + wait_yield(th->m_pool_state, ThreadState::Active); } else { ++thread_spawn_failed; } @@ -699,7 +700,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; memory_fence(); @@ -789,11 +790,11 @@ void ThreadsInternal::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsInternal::Terminating; + s_threads_exec[i]->m_pool_state = ThreadState::Terminating; - wait_yield(s_threads_process.m_pool_state, ThreadsInternal::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_threads_pid[i] = std::thread::id(); @@ -819,7 +820,7 @@ void ThreadsInternal::finalize() { s_threads_process.m_pool_rank = 0; s_threads_process.m_pool_size = 1; s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsInternal::Inactive; + 
s_threads_process.m_pool_state = ThreadState::Inactive; Kokkos::Profiling::finalize(); } diff --git a/core/src/Threads/Kokkos_Threads_Instance.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp index dfbace60939..a3fb7739e09 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -23,13 +23,13 @@ #include #include -#include - #include #include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -39,25 +39,9 @@ class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { - Terminating ///< Termination in progress - , - Inactive ///< Exists, waiting for work - , - Active ///< Exists, performing work - , - Rendezvous ///< Exists, waiting in a barrier or reduce - - , - ScanCompleted, - ScanAvailable, - ReductionAvailable - }; + static constexpr int MAX_FAN_COUNT = 16; + static constexpr int MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2); + static constexpr int VECTOR_LENGTH = 8; private: friend class Kokkos::Threads; @@ -78,7 +62,7 @@ class ThreadsInternal { int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - int volatile m_pool_state; ///< State for global synchronizations + ThreadState volatile m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -119,7 +103,7 @@ class ThreadsInternal { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } @@ -150,7 +134,7 @@ class 
ThreadsInternal { //------------------------------------ - static void wait_yield(volatile int &, const int); + static void wait_yield(volatile ThreadState &, const ThreadState); //------------------------------------ // All-thread functions: @@ -166,16 +150,14 @@ class ThreadsInternal { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsInternal::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsInternal::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, - ThreadsInternal::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -193,7 +175,7 @@ class ThreadsInternal { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsInternal::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } @@ -209,23 +191,21 @@ class ThreadsInternal { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsInternal::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsInternal::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, - ThreadsInternal::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = 
ThreadsInternal::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } } @@ -240,8 +220,7 @@ class ThreadsInternal { for (int i = 0; i < m_pool_fan_size; ++i) { ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsInternal::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join( reinterpret_cast(reduce_memory()), @@ -270,9 +249,8 @@ class ThreadsInternal { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsInternal::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } } @@ -298,8 +276,7 @@ class ThreadsInternal { ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsInternal::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join(work_value, fan.reduce_memory()); } @@ -310,7 +287,7 @@ class ThreadsInternal { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsInternal::ReductionAvailable; + m_pool_state = ThreadState::ReductionAvailable; // Wait for contributing threads' scan value to be available. 
if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { @@ -318,22 +295,19 @@ class ThreadsInternal { // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(th.m_pool_state, - ThreadsInternal::Active); - Impl::spinwait_while_equal(th.m_pool_state, - ThreadsInternal::ReductionAvailable); + spinwait_while_equal(th.m_pool_state, ThreadState::Active); + spinwait_while_equal(th.m_pool_state, ThreadState::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsInternal::ScanAvailable; + m_pool_state = ThreadState::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal(m_pool_state, - ThreadsInternal::ScanAvailable); + spinwait_while_equal(m_pool_state, ThreadState::ScanAvailable); } //-------------------------------- @@ -341,10 +315,9 @@ class ThreadsInternal { for (int i = 0; i < m_pool_fan_size; ++i) { ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsInternal::ReductionAvailable); + spinwait_while_equal(fan.m_pool_state, ThreadState::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsInternal::Rendezvous; + fan.m_pool_state = ThreadState::Rendezvous; } // All threads have completed the inclusive scan. 
@@ -371,20 +344,18 @@ class ThreadsInternal { // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsInternal::Rendezvous); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsInternal::ScanCompleted; + m_pool_state = ThreadState::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal(m_pool_state, - ThreadsInternal::ScanCompleted); + spinwait_while_equal(m_pool_state, ThreadState::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsInternal::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -401,9 +372,8 @@ class ThreadsInternal { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsInternal::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } for (unsigned i = 0; i < count; ++i) { @@ -411,10 +381,9 @@ class ThreadsInternal { } if (rev_rank) { - m_pool_state = ThreadsInternal::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, - ThreadsInternal::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -436,7 +405,7 @@ class ThreadsInternal { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsInternal::Active; + m_pool_base[rev_rank + (1 << 
i)]->m_pool_state = ThreadState::Active; } } diff --git a/core/src/impl/Kokkos_Spinwait.cpp b/core/src/Threads/Kokkos_Threads_Spinwait.cpp similarity index 90% rename from core/src/impl/Kokkos_Spinwait.cpp rename to core/src/Threads/Kokkos_Threads_Spinwait.cpp index 0a7eda29bcf..3df9dc07bf4 100644 --- a/core/src/impl/Kokkos_Spinwait.cpp +++ b/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -108,5 +108,15 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + } // namespace Impl } // namespace Kokkos diff --git a/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/core/src/Threads/Kokkos_Threads_Spinwait.hpp new file mode 100644 index 00000000000..b98b6dbb73b --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_SPINWAIT_HPP +#define KOKKOS_THREADS_SPINWAIT_HPP + +#include + +#include + +namespace Kokkos { +namespace Impl { + +enum class WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , + PASSIVE // Used to quickly yield the thread to quiet down the system + , + ROOT // Never sleep or yield the root thread +}; + +void host_thread_yield(const uint32_t i, const WaitMode mode); + +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value); + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/Threads/Kokkos_Threads_State.hpp b/core/src/Threads/Kokkos_Threads_State.hpp new file mode 100644 index 00000000000..148e9aa4e05 --- /dev/null +++ b/core/src/Threads/Kokkos_Threads_State.hpp @@ -0,0 +1,39 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_STATE_HPP +#define KOKKOS_THREADS_STATE_HPP + +namespace Kokkos { +namespace Impl { +/** \brief States of a worker thread */ +enum class ThreadState { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + , + ScanCompleted, + ScanAvailable, + ReductionAvailable +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index 958f0053410..b811a7944ba 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -22,10 +22,11 @@ #include #include -#include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -84,15 +85,13 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - Impl::spinwait_while_equal(m_team_base[j]->state(), - ThreadsInternal::Active); + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); } // If not root then wait for release if (m_team_rank_rev) { - m_instance->state() = ThreadsInternal::Rendezvous; - Impl::spinwait_while_equal(m_instance->state(), - ThreadsInternal::Rendezvous); + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); } return !m_team_rank_rev; @@ -103,7 +102,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - m_team_base[j]->state() = ThreadsInternal::Active; + m_team_base[j]->state() = ThreadState::Active; } } diff --git a/core/src/impl/Kokkos_HostThreadTeam.cpp b/core/src/impl/Kokkos_HostThreadTeam.cpp index bfe5902bf7f..11bf701b57a 100644 --- 
a/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -22,7 +22,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/impl/Kokkos_Spinwait.hpp b/core/src/impl/Kokkos_Spinwait.hpp deleted file mode 100644 index c57b17d646a..00000000000 --- a/core/src/impl/Kokkos_Spinwait.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -#include -#include - -#include - -#include - -namespace Kokkos { -namespace Impl { - -enum class WaitMode : int { - ACTIVE // Used for tight loops to keep threads active longest - , - PASSIVE // Used to quickly yield the thread to quite down the system - , - ROOT // Never sleep or yield the root thread -}; - -void host_thread_yield(const uint32_t i, const WaitMode mode); - -template -std::enable_if_t::value, void> root_spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> root_spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); 
-} - -template -std::enable_if_t::value, void> spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ From ae0bd54eba3aca520b1c86fbd5b0181f129e0c25 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Tue, 27 Jun 2023 16:42:39 -0600 Subject: [PATCH 085/432] Added unit tests for reduction ops and few intel svml intrinsics --- simd/src/Kokkos_SIMD_Scalar.hpp | 8 +- simd/unit_tests/TestSIMD.cpp | 1 + simd/unit_tests/include/SIMDTesting_Ops.hpp | 161 ++++++++++++++++ simd/unit_tests/include/TestSIMD_MathOps.hpp | 21 ++- .../include/TestSIMD_Reductions.hpp | 178 ++++++++++++++++++ 5 files changed, 360 insertions(+), 9 deletions(-) create mode 100644 simd/unit_tests/include/TestSIMD_Reductions.hpp diff --git a/simd/src/Kokkos_SIMD_Scalar.hpp b/simd/src/Kokkos_SIMD_Scalar.hpp index 7443f5596b5..530306d6785 100644 --- a/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/simd/src/Kokkos_SIMD_Scalar.hpp @@ -315,13 +315,13 @@ class const_where_expression, mem[static_cast(index)] = static_cast(m_value); } - 
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& - impl_get_value() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION value_type const& impl_get_value() + const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& - impl_get_mask() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION mask_type const& impl_get_mask() + const { return m_mask; } }; diff --git a/simd/unit_tests/TestSIMD.cpp b/simd/unit_tests/TestSIMD.cpp index 61c076e8246..7a1f9be2a0f 100644 --- a/simd/unit_tests/TestSIMD.cpp +++ b/simd/unit_tests/TestSIMD.cpp @@ -21,3 +21,4 @@ #include #include #include +#include diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 6529f20e66a..c587ccf3046 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -209,4 +209,165 @@ class shift_left { } }; +class cbrt_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::cbrt(a); +#else + return Kokkos::cbrt(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::cbrt(a); + } +}; + +class exp_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::exp(a); +#else + return Kokkos::exp(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::exp(a); + } +}; + +class log_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::log(a); +#else + return Kokkos::log(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::log(a); + } +}; + +class hmin { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename 
T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } +}; + +class hmax { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } +}; + +class reduce { + public: + template + auto on_host(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + auto on_host_serial(T const& a) 
const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } +}; + #endif diff --git a/simd/unit_tests/include/TestSIMD_MathOps.hpp b/simd/unit_tests/include/TestSIMD_MathOps.hpp index 802e41efe5f..fae17a07ace 100644 --- a/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,13 +61,18 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + i, nlanes, arg); if (!loaded_arg) continue; - auto computed_result = unary_op.on_host(arg); - decltype(computed_result) expected_result; + decltype(unary_op.on_host(arg)) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { - if (lane < nlanes) + if (lane < nlanes) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + arg[lane] = Kokkos::abs(arg[lane]); expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); + } } + auto computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -96,6 +101,13 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], // TODO: Place fallback implementations for all simd 
integer types if constexpr (std::is_floating_point_v) { host_check_math_op_all_loaders(divides(), n, first_args, second_args); + +#if defined(__INTEL_COMPILER) && \ + (defined(KOKKOS_ARCH_AVX2) || defined(KOKKOS_ARCH_AVX512XEON)) + host_check_math_op_all_loaders(cbrt_op(), n, first_args); + host_check_math_op_all_loaders(exp_op(), n, first_args); + host_check_math_op_all_loaders(log_op(), n, first_args); +#endif } } @@ -282,8 +294,7 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { - Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), - simd_device_math_ops_functor()); + Kokkos::parallel_for(1, simd_device_math_ops_functor()); } #endif diff --git a/simd/unit_tests/include/TestSIMD_Reductions.hpp b/simd/unit_tests/include/TestSIMD_Reductions.hpp new file mode 100644 index 00000000000..b1aef98c2a8 --- /dev/null +++ b/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -0,0 +1,178 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_REDUCTIONS_HPP +#define KOKKOS_TEST_SIMD_REDUCTIONS_HPP + +#include +#include + +template +inline void host_check_reduction_one_loader(ReductionOp reduce_op, + std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.host_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < n; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_host_serial(value); + auto computed = reduce_op.on_host(value); + + gtest_checker().equality(expected, computed); + } +} + +template +inline void host_check_reduction_all_loaders(ReductionOp reduce_op, + std::size_t n, T const* args) { + host_check_reduction_one_loader(reduce_op, n, + args); + host_check_reduction_one_loader(reduce_op, n, args); + host_check_reduction_one_loader(reduce_op, n, args); +} + +template +inline void host_check_all_reductions(const DataType (&args)[n]) { + host_check_reduction_all_loaders(hmin(), n, args); + host_check_reduction_all_loaders(hmax(), n, args); + host_check_reduction_all_loaders(reduce(), n, args); +} + +template +inline void host_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + host_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + host_check_all_reductions(args); + } +} + +template +inline void host_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + 
(host_check_reductions(), ...); +} + +template +inline void host_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_reductions_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_one_loader( + ReductionOp reduce_op, std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.device_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < n; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_device_serial(value); + auto computed = reduce_op.on_device(value); + + kokkos_checker().equality(expected, computed); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( + ReductionOp reduce_op, std::size_t n, T const* args) { + device_check_reduction_one_loader(reduce_op, n, + args); + device_check_reduction_one_loader(reduce_op, n, args); + device_check_reduction_one_loader(reduce_op, n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_all_reductions( + const DataType (&args)[n]) { + device_check_reduction_all_loaders(hmin(), n, args); + device_check_reduction_all_loaders(hmax(), n, args); + device_check_reduction_all_loaders(reduce(), n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + device_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 
0, 1, 2}; + device_check_all_reductions(args); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_reductions(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_reductions_all_types(DataTypes()), ...); +} + +class simd_device_reduction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_reductions_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_reductions) { + host_check_reductions_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_reductions) { + Kokkos::parallel_for(1, simd_device_reduction_functor()); +} + +#endif From 9158785df8e224e3e5ec8cea5dbfd53e4c0e91c4 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 25 Oct 2023 08:48:00 -0400 Subject: [PATCH 086/432] Remove sleep and wake functions --- core/src/Cuda/Kokkos_Cuda.hpp | 20 ------- core/src/SYCL/Kokkos_SYCL.hpp | 6 -- core/src/Threads/Kokkos_Threads_Instance.cpp | 63 -------------------- core/src/Threads/Kokkos_Threads_Instance.hpp | 3 - 4 files changed, 92 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 8bfaf8317b6..3fda01964f6 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -137,26 +137,6 @@ class Cuda { #endif } - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. 
- */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); - /// \brief Wait until all dispatched functors complete. /// /// The parallel_for or parallel_reduce dispatch of a functor may diff --git a/core/src/SYCL/Kokkos_SYCL.hpp b/core/src/SYCL/Kokkos_SYCL.hpp index be6b4b89302..8de860c87f6 100644 --- a/core/src/SYCL/Kokkos_SYCL.hpp +++ b/core/src/SYCL/Kokkos_SYCL.hpp @@ -86,12 +86,6 @@ class SYCL { #endif } - /** \brief Set the device in a "sleep" state. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); - /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 1c58a7c2732..27527ab7b3f 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -42,7 +42,6 @@ namespace Kokkos { namespace Impl { namespace { -std::mutex host_internal_cppthread_mutex; // std::thread compatible driver. 
// Recovery from an exception would require constant intra-thread health @@ -125,12 +124,6 @@ bool ThreadsInternal::is_process() { return master_pid == std::this_thread::get_id(); } -void ThreadsInternal::global_lock() { host_internal_cppthread_mutex.lock(); } - -void ThreadsInternal::global_unlock() { - host_internal_cppthread_mutex.unlock(); -} - //---------------------------------------------------------------------------- void ThreadsInternal::wait_yield(volatile ThreadState &flag, @@ -269,24 +262,6 @@ ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { return th; } -//---------------------------------------------------------------------------- - -void ThreadsInternal::execute_sleep(ThreadsInternal &exec, const void *) { - ThreadsInternal::global_lock(); - ThreadsInternal::global_unlock(); - - const int n = exec.m_pool_fan_size; - const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); - - for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal( - exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadState::Active); - } - - exec.m_pool_state = ThreadState::Inactive; -} - } // namespace Impl } // namespace Kokkos @@ -391,44 +366,6 @@ void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), //---------------------------------------------------------------------------- -bool ThreadsInternal::sleep() { - verify_is_process("ThreadsInternal::sleep", true); - - if (&execute_sleep == s_current_function) return false; - - fence(); - - ThreadsInternal::global_lock(); - - s_current_function = &execute_sleep; - - // Activate threads: - for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadState::Active; - } - - return true; -} - -bool ThreadsInternal::wake() { - verify_is_process("ThreadsInternal::wake", true); - - if (&execute_sleep != s_current_function) return false; - - ThreadsInternal::global_unlock(); - - if (s_threads_process.m_pool_base) { - 
execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadState::Inactive; - } - - fence(); - - return true; -} - -//---------------------------------------------------------------------------- - void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = s_threads_process.m_pool_base ? 1 : 0; diff --git a/core/src/Threads/Kokkos_Threads_Instance.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp index a3fb7739e09..6d7162f5373 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -81,7 +81,6 @@ class ThreadsInternal { static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsInternal &, const void *); ThreadsInternal(const ThreadsInternal &); ThreadsInternal &operator=(const ThreadsInternal &); @@ -424,8 +423,6 @@ class ThreadsInternal { static void internal_fence( const std::string &, Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static bool sleep(); - static bool wake(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread From 1fcce6936bf667339350381c8b265b5d82d810da Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Oct 2023 15:38:01 -0400 Subject: [PATCH 087/432] Remove extra constructor --- core/src/Cuda/Kokkos_Cuda.hpp | 2 - core/src/Cuda/Kokkos_Cuda_Instance.cpp | 60 +++++-------------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 2 +- core/unit_test/CMakeLists.txt | 6 -- .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 34 ----------- 5 files changed, 17 insertions(+), 87 deletions(-) delete mode 100644 core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 7ca89299fac..8bfaf8317b6 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -189,8 +189,6 @@ class Cuda { KOKKOS_DEPRECATED Cuda(cudaStream_t stream, bool 
manage_stream); - Cuda(int device_id, cudaStream_t stream); - //-------------------------------------------------------------------------- //! Free any resources being consumed by the device. static void impl_finalize(); diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index fb5a97b4ae9..26d428fa871 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -370,32 +370,28 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(int cuda_device, cudaStream_t stream, - bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { + KOKKOS_EXPECTS(!is_initialized()); + + if (was_finalized) + Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); + was_initialized = true; + // Check that the device associated with the stream matches cuda_device CUcontext context; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); - int device_for_stream; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&device_for_stream))); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPopCurrent(&context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); - if (device_for_stream != cuda_device) { - std::stringstream ss; - ss << "Error: The provided stream is associated with device " - << device_for_stream << " but device " << cuda_device - << " was requested in the execution space instance constructor!"; - Kokkos::abort(ss.str().c_str()); - } - - KOKKOS_EXPECTS(!is_initialized()); + // FIXME_CUDA multiple devices + if (m_cudaDev != Cuda().cuda_device()) + Kokkos::abort( + "Currently, the device id must match the device id used when Kokkos " + "was initialized!"); - if (was_finalized) - Kokkos::abort("Calling 
Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; - m_cudaDev = cuda_device; - //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. @@ -813,7 +809,7 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default cudaStream_t singleton_stream; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); - Impl::CudaInternal::singleton().initialize(cuda_device_id, singleton_stream, + Impl::CudaInternal::singleton().initialize(singleton_stream, /*manage*/ true); } @@ -864,31 +860,7 @@ Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, - stream, static_cast(manage_stream)); -} - -Cuda::Cuda(int device_id, cudaStream_t stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { - Impl::CudaInternal::singleton().verify_is_initialized( - "Cuda instance constructor"); - const int n_devices = Kokkos::Cuda::detect_device_count(); - if (device_id < 0 || device_id >= n_devices) { - std::stringstream ss; - ss << "Error: Requested GPU with invalid id '" << device_id << "'." - << " The device id must be in the interval [0, " << n_devices << ")!" 
- << " Raised by Kokkos::Cuda::Cuda().\n"; - Kokkos::abort(ss.str().c_str()); - } - // FIXME_CUDA - if (device_id != Cuda().cuda_device()) - Kokkos::abort( - "Currently, the device id must match the device id used when Kokkos " - "was initialized!"); - m_space_instance->initialize(device_id, stream, /*manage_stream*/ false); + m_space_instance->initialize(stream, static_cast(manage_stream)); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 2a0c6ff8466..6692144a9e0 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -156,7 +156,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(int cuda_device, cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream, bool manage_stream); void finalize(); void print_configuration(std::ostream&) const; diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 8c9dd2679d0..e633bdba7f0 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -763,12 +763,6 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaInterOpStreamsMultiGPU - SOURCES - UnitTestMain.cpp - cuda/TestCuda_InterOp_StreamsMultiGPU.cpp - ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp deleted file mode 100644 index a3c71315700..00000000000 --- a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ /dev/null @@ -1,34 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include -#include - -namespace Test { -// Test Interoperability with Cuda Streams and multiple GPUs. -TEST(cuda, raw_cuda_streams) { - Kokkos::ScopeGuard scope_guard; - - cudaStream_t stream; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); - { - TEST_EXECSPACE cuda_instance(TEST_EXECSPACE().cuda_device(), stream); - ASSERT_EQ(cuda_instance.cuda_device(), TEST_EXECSPACE().cuda_device()); - } - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream)); -} -} // namespace Test From 3bcf9657f55209fdfd3a5227e23484e1e2d03f7e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Oct 2023 17:35:29 -0400 Subject: [PATCH 088/432] Prefer defaulted default constructor for Bitset (#6524) * Prefer defaulted default constructor for Bitset The default argument to the constructor that takes the size of the bitset was deferring to another constructor that creates an empty view with a label argument. This allocates 128 bits for the view header. This showed when constructing an UnorderedMap with a pointless 128-bit "header-only" allocation which implies an unnecessary fence. 
* Fixup update Bitset allocated unit test * Add tool-based test checking bitset default constructor does not allocate * Check other Bitset constructors do allocate * Drop weak test to see if NVCC ICE is gone --- containers/src/Kokkos_Bitset.hpp | 9 +++++---- containers/unit_tests/TestBitset.hpp | 25 ++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index 9ccb52cf119..f50ab0a0f7e 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -74,9 +74,10 @@ class Bitset { using block_view_type = View>; public: - /// constructor + Bitset() = default; + /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + Bitset(unsigned arg_size) : Bitset(Kokkos::view_alloc(), arg_size) {} template Bitset(const Impl::ViewCtorProp& arg_prop, unsigned arg_size) @@ -291,8 +292,8 @@ class Bitset { } private: - unsigned m_size; - unsigned m_last_block_mask; + unsigned m_size = 0; + unsigned m_last_block_mask = 0; block_view_type m_blocks; private: diff --git a/containers/unit_tests/TestBitset.hpp b/containers/unit_tests/TestBitset.hpp index 3ad0d2bf573..9923453f72c 100644 --- a/containers/unit_tests/TestBitset.hpp +++ b/containers/unit_tests/TestBitset.hpp @@ -23,6 +23,8 @@ #include #include +#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> + namespace Test { namespace Impl { @@ -155,7 +157,7 @@ void test_bitset() { { unsigned ts = 100u; - bitset_type b1; + bitset_type b1(Kokkos::view_alloc("MyBitset"), 0); ASSERT_TRUE(b1.is_allocated()); b1 = bitset_type(ts); @@ -165,6 +167,9 @@ void test_bitset() { ASSERT_TRUE(b1.is_allocated()); ASSERT_TRUE(b2.is_allocated()); ASSERT_TRUE(b3.is_allocated()); + + bitset_type b4; + ASSERT_FALSE(b4.is_allocated()); } std::array test_sizes = { @@ -237,6 +242,24 @@ void test_bitset() { } TEST(TEST_CATEGORY, bitset) { test_bitset(); } + 
+TEST(TEST_CATEGORY, bitset_default_constructor_no_alloc) { + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + auto success = validate_absence( + [&]() { + Kokkos::Bitset bs; + EXPECT_FALSE(bs.is_allocated()); + }, + [&](AllocateDataEvent) { + return MatchDiagnostic{true, {"Found alloc event"}}; + }); + ASSERT_TRUE(success); + + listen_tool_events(Config::DisableAll()); +} + } // namespace Test #endif // KOKKOS_TEST_BITSET_HPP From 09756717d98b0e600843e6e6e54c171715156b7d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 26 Oct 2023 08:28:05 -0400 Subject: [PATCH 089/432] SYCL: Use host-pinned memory to copy reduction/scan result (#6500) * SYCL: Use host-pinned memory to copy reduction/scan result * Remove unused variable * m_shared_memory_lock -> m_host_scratch_lock; improve comments * Add comment for choosing memcpy over fence+deep_copy * m_[host_]scratch_lock->m_scratch_buffers_lock --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 30 ++++++++++++- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 3 ++ .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 12 +++--- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 37 ++++++++++------ .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 31 ++++++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 43 ++++++++++++------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 39 +++++++++-------- 7 files changed, 130 insertions(+), 65 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 080369770d7..3f931c016ff 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -54,7 +54,7 @@ Kokkos::View sycl_global_unique_token_locks( } SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchFlags) { + if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " 
"Kokkos::Experimental::SYCL::finalize()" << std::endl; @@ -199,11 +199,15 @@ void SYCLInternal::finalize() { using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; if (nullptr != m_scratchSpace) RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + if (nullptr != m_scratchHost) + RecordSYCL::decrement(RecordSYCL::get_record(m_scratchHost)); if (nullptr != m_scratchFlags) RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); m_syclDev = -1; m_scratchSpaceCount = 0; m_scratchSpace = nullptr; + m_scratchHostCount = 0; + m_scratchHost = nullptr; m_scratchFlagsCount = 0; m_scratchFlags = nullptr; @@ -250,6 +254,30 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { return m_scratchSpace; } +sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { + if (verify_is_initialized("scratch_unified") && + m_scratchHostCount < scratch_count(size)) { + m_scratchHostCount = scratch_count(size); + + using Record = Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::SYCLHostUSMSpace, void>; + + if (m_scratchHost) Record::decrement(Record::get_record(m_scratchHost)); + + std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( + m_scratchHostCount, sizeScratchGrain); + Record* const r = Record::allocate( + Kokkos::Experimental::SYCLHostUSMSpace(*m_queue), + "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size); + + Record::increment(r); + + m_scratchHost = reinterpret_cast(r->data()); + } + + return m_scratchHost; +} + sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 51a617054d6..7f9ce48109c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -45,6 +45,7 @@ class SYCLInternal { sycl::device_ptr scratch_space(const std::size_t size); sycl::device_ptr 
scratch_flags(const std::size_t size); + sycl::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, @@ -60,6 +61,8 @@ class SYCLInternal { std::size_t m_scratchSpaceCount = 0; sycl::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::host_ptr m_scratchHost = nullptr; std::size_t m_scratchFlagsCount = 0; sycl::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 4fc5818ce9b..1f2629407b0 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor, int m_shmem_size; sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_lock; + // Only let one ParallelFor instance at a time use the team scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -141,9 +141,9 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 6964c2dbcf0..bc2e47658ed 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -95,6 +95,11 @@ class Kokkos::Impl::ParallelReduce results_ptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; @@ -109,8 +114,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); cgh.single_task([=]() { const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -148,8 +155,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? 
m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); @@ -296,11 +305,13 @@ class Kokkos::Impl::ParallelReduce( - m_space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + m_space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(value_type) * value_count); } return last_reduction_event; @@ -335,9 +346,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 8c900cfa428..d93962419c9 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -70,8 +70,15 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); sycl::event last_reduction_event; @@ -320,11 +327,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -354,9 +363,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 07145b0fb93..dc3d77d15d7 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce m_scratch_lock; + // Only let one ParallelReduce instance at a time use the team scratch memory + // and the host scratch memory. The constructor acquires the mutex which is + // released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -79,6 +80,11 @@ class Kokkos::Impl::ParallelReduce>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; @@ -89,8 +95,10 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -164,8 +172,11 @@ class Kokkos::Impl::ParallelReduce local_mem, sycl::device_ptr results_ptr) { - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>( + host_result_ptr); auto lambda = [=](sycl::nd_item<2> item) { auto n_wgroups = item.get_group_range()[1]; int wgroup_size = @@ -358,11 +369,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result not " + "device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -448,9 +461,9 @@ class Kokkos::Impl::ParallelReducem_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { initialize(); } }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 04425723e19..e0c79dcda41 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -111,13 +111,13 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - pointer_type m_scratch_space = nullptr; - const pointer_type m_result_ptr; + sycl::host_ptr m_scratch_host = nullptr; + pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one Parallel/Scan modify the shared memory. 
The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_shared_memory_lock; + // Only let one ParallelScan instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; private: template @@ -253,7 +253,8 @@ class ParallelScanSYCLBase { global_mem = static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_space = global_mem; + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -281,10 +282,11 @@ class ParallelScanSYCLBase { // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. - auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; + pointer_type result_ptr = m_result_ptr_device_accessible + ? 
m_result_ptr + : static_cast(m_scratch_host); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(perform_work_group_scans); @@ -293,7 +295,6 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -312,9 +313,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem_copy[global_id] = update; - if (global_id == size - 1 && result_ptr_device_accessible) - *result_ptr = update; + if (global_id == size - 1) *result_ptr = update; } }); }); @@ -351,9 +350,9 @@ class ParallelScanSYCLBase { m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_shared_memory_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_scratch_buffers_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexScratchSpace) {} }; } // namespace Kokkos::Impl @@ -390,11 +389,13 @@ class Kokkos::Impl::ParallelScanWithTotal< Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x + // slower. 
+ m_exec.fence( + "Kokkos::Impl::ParallelReduce::execute: " + "result not device-accessible"); const int size = Base::m_functor_reducer.get_reducer().value_size(); - DeepCopy(m_exec, Base::m_result_ptr, - Base::m_scratch_space + nwork - 1, - size); + std::memcpy(Base::m_result_ptr, Base::m_scratch_host, size); } }); } From b875be75d9c6a0c91e77503552fbdc71624cf73f Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Tue, 10 Oct 2023 10:53:36 -0400 Subject: [PATCH 090/432] Remove unused variables --- core/src/Threads/Kokkos_Threads_Instance.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 27527ab7b3f..f5d9205a598 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -70,17 +70,13 @@ std::pair int s_thread_pool_size[3] = {0, 0, 0}; -unsigned s_current_reduce_size = 0; -unsigned s_current_shared_size = 0; - void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; struct Sentinel { ~Sentinel() { if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_reduce_size || - s_current_shared_size || s_current_function || s_current_function_arg || + s_thread_pool_size[2] || s_current_function || s_current_function_arg || s_threads_exec[0]) { std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " "initialized" @@ -511,8 +507,6 @@ void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { if (nullptr == s_threads_process.m_pool_base) { s << " Asynchronous"; } - s << " ReduceScratch[" << s_current_reduce_size << "]" - << " SharedScratch[" << s_current_shared_size << "]"; s << std::endl; if (detail) { From 6ac5aa84642a7aee19102c7a63d5dbd5f355ef88 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Tue, 10 Oct 2023 11:11:39 -0400 Subject: [PATCH 091/432] Remove 
Sentinel struct from Threads --- core/src/Threads/Kokkos_Threads_Instance.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index f5d9205a598..0610d05c5ce 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -73,18 +73,6 @@ int s_thread_pool_size[3] = {0, 0, 0}; void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; -struct Sentinel { - ~Sentinel() { - if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_function || s_current_function_arg || - s_threads_exec[0]) { - std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " - "initialized" - << std::endl; - } - } -}; - inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); unsigned count = 0; @@ -549,8 +537,6 @@ void ThreadsInternal::initialize(int thread_count_arg) { unsigned use_numa_count = 0; unsigned use_cores_per_numa = 0; bool allow_asynchronous_threadpool = false; - // need to provide an initializer for Intel compilers - static const Sentinel sentinel = {}; const bool is_initialized = 0 != s_thread_pool_size[0]; From 7d31c22738a64ebf7e000dc9aab1c1a8258c3d3e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Tue, 10 Oct 2023 11:40:28 -0400 Subject: [PATCH 092/432] Small cleanup of ThreadsInternal::initialize --- core/src/Threads/Kokkos_Threads_Instance.cpp | 69 ++++++++------------ 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 0610d05c5ce..829c8d43700 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -532,11 +532,7 @@ void ThreadsInternal::print_configuration(std::ostream &s, const bool 
detail) { int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } void ThreadsInternal::initialize(int thread_count_arg) { - // legacy arguments - unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; - unsigned use_numa_count = 0; - unsigned use_cores_per_numa = 0; - bool allow_asynchronous_threadpool = false; + unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; const bool is_initialized = 0 != s_thread_pool_size[0]; @@ -546,10 +542,8 @@ void ThreadsInternal::initialize(int thread_count_arg) { s_threads_exec[i] = nullptr; if (!is_initialized) { - // If thread_count, use_numa_count, or use_cores_per_numa are zero - // then they will be given default values based upon hwloc detection - // and allowed asynchronous execution. - + // If thread_count is zero then it will be given default values based upon + // hwloc detection. const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); @@ -562,17 +556,18 @@ void ThreadsInternal::initialize(int thread_count_arg) { : 1; } - const unsigned thread_spawn_begin = hwloc::thread_mapping( - "Kokkos::Threads::initialize", allow_asynchronous_threadpool, - thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + const bool allow_asynchronous_threadpool = false; + unsigned use_numa_count = 0; + unsigned use_cores_per_numa = 0; + hwloc::thread_mapping("Kokkos::Threads::initialize", + allow_asynchronous_threadpool, thread_count, + use_numa_count, use_cores_per_numa, s_threads_coord); const std::pair proc_coord = s_threads_coord[0]; - if (thread_spawn_begin) { - // Synchronous with s_threads_coord[0] as the process core - // Claim entry #0 for binding the process core. - s_threads_coord[0] = std::pair(~0u, ~0u); - } + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. 
+ s_threads_coord[0] = std::pair(~0u, ~0u); s_thread_pool_size[0] = thread_count; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; @@ -580,7 +575,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { s_current_function = &execute_function_noop; // Initialization work function - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will @@ -604,10 +599,10 @@ void ThreadsInternal::initialize(int thread_count_arg) { // Wait for all spawned threads to deactivate before zeroing the function. - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal * volatile *)s_threads_exec)[ith]; + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { @@ -628,27 +623,19 @@ void ThreadsInternal::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - if (thread_spawn_begin) { // Include process in pool. 
- const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; - s_threads_process.m_pool_rank = - thread_count - 1; // Reversed for scan-compatible reductions - s_threads_process.m_pool_size = thread_count; - s_threads_process.m_pool_fan_size = fan_size( - s_threads_process.m_pool_rank, s_threads_process.m_pool_size); - s_threads_pid[s_threads_process.m_pool_rank] = - std::this_thread::get_id(); - } else { - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 0; - s_threads_process.m_pool_fan_size = 0; - } + const std::pair coord = + Kokkos::hwloc::get_this_thread_coordinate(); + + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_numa_rank = coord.first; + s_threads_process.m_numa_core_rank = coord.second; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = std::this_thread::get_id(); // Initial allocations: ThreadsInternal::resize_scratch(1024, 1024); From bb759df490419dfa3669822c4216cca4795ef3e4 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Oct 2023 10:15:55 -0400 Subject: [PATCH 093/432] Remove useless forward declaration --- core/src/Threads/Kokkos_Threads.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index c84ef0d405c..558d5a93984 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -40,7 +40,6 @@ static_assert(false, namespace Kokkos { namespace Impl { 
-class ThreadsInternal; enum class fence_is_static { yes, no }; } // namespace Impl } // namespace Kokkos From a417450bb7f6c5dc79de722df7519db9868364c5 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Oct 2023 10:47:15 -0400 Subject: [PATCH 094/432] Remove spawn function --- core/src/Threads/Kokkos_Threads_Instance.cpp | 13 ++----------- core/src/Threads/Kokkos_Threads_Instance.hpp | 1 - 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 829c8d43700..d1dad8afdc7 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -92,16 +92,6 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { namespace Kokkos { namespace Impl { -//---------------------------------------------------------------------------- -// Spawn a thread - -void ThreadsInternal::spawn() { - std::thread t(internal_cppthread_driver); - t.detach(); -} - -//---------------------------------------------------------------------------- - bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); @@ -592,7 +582,8 @@ void ThreadsInternal::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. 
- ThreadsInternal::spawn(); + std::thread t(internal_cppthread_driver); + t.detach(); wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } diff --git a/core/src/Threads/Kokkos_Threads_Instance.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp index 6d7162f5373..ff010c1ccd0 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -77,7 +77,6 @@ class ThreadsInternal { static void global_lock(); static void global_unlock(); - static void spawn(); static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); From 33010ecc3d169a44f3fddf66e7913ecad9e0f6fa Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 11 Oct 2023 16:13:07 -0400 Subject: [PATCH 095/432] Add comments --- core/src/Threads/Kokkos_Threads_Instance.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index d1dad8afdc7..8801eca0d4f 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -136,8 +136,8 @@ ThreadsInternal::ThreadsInternal() m_pool_fan_size(0), m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { - // A spawned thread - + // The code in the if is executed by a spawned thread not by the root + // thread ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding @@ -326,7 +326,9 @@ void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), // Make sure function and arguments are written before activating threads. memory_fence(); - // Activate threads: + // Activate threads. The spawned threads will start working on + // s_current_function. The root thread is only set to active, we still need to + // call s_current_function. 
for (int i = s_thread_pool_size[0]; 0 < i--;) { s_threads_exec[i]->m_pool_state = ThreadState::Active; } From c4d0dfe02092e2ecb157039288116db52a62d1d7 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 26 Oct 2023 10:08:01 -0400 Subject: [PATCH 096/432] Fix indentation --- core/src/Threads/Kokkos_Threads_Instance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 8801eca0d4f..78c63abfa1c 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -595,7 +595,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal *volatile *)s_threads_exec)[ith]; + ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { From b4f27c87f2b70f410859b4eca83b382dd0819540 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 26 Oct 2023 10:31:02 -0400 Subject: [PATCH 097/432] Fix typo in macro guard --- core/src/HIP/Kokkos_HIP_Parallel_Range.hpp | 4 ++-- core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 4 ++-- core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 4 ++-- core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp | 4 ++-- core/src/Serial/Kokkos_Serial_Parallel_Range.hpp | 4 ++-- core/src/Serial/Kokkos_Serial_Parallel_Team.hpp | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 26e8be4698a..7356dd2d305 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP -#define KOKKO_HIP_PARALLEL_RANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_RANGE_HPP +#define 
KOKKOS_HIP_PARALLEL_RANGE_HPP #include diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 3fe568ac361..17b4597f988 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP -#define KOKKO_HIP_PARALLEL_TEAM_HPP +#ifndef KOKKOS_HIP_PARALLEL_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_TEAM_HPP #include diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index e0c79dcda41..82151a73f86 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP -#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP +#ifndef KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 69787aa5001..67978aa3e9f 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP -#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP #include #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 56894716dbd..91b4c567113 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP -#define KOKKO_SERIAL_PARALLEL_RANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_RANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_RANGE_HPP #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp 
b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 0876f1af229..f34a7daaca0 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_TEAM_HPP -#define KOKKO_SERIAL_PARALLEL_TEAM_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_TEAM_HPP +#define KOKKOS_SERIAL_PARALLEL_TEAM_HPP #include From 840d6b775a33616ec17ebeb3e4f9a0cbc742e4c9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 26 Oct 2023 17:44:22 -0400 Subject: [PATCH 098/432] Reduce number of View constructor instantiations View constructors with templated label take std::string and runtime unmanaged View constructors taking pointers now constrained. This reduces the number of instantiation when passing C string arguments to the View constructors. Co-authored-by: Christian Trott --- core/src/Kokkos_View.hpp | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index bcbb28014cd..c953680dc78 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1489,26 +1489,20 @@ class View : public ViewTraits { } // Allocate with label and layout - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, - typename traits::array_layout> const& arg_layout) + explicit inline View(std::string const& arg_label, + typename traits::array_layout const& arg_layout) : View(Impl::ViewCtorProp(arg_label), arg_layout) {} // Allocate label and layout, must disambiguate from subview constructor. 
- template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + explicit inline View(std::string const& arg_label, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) : View(Impl::ViewCtorProp(arg_label), typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) { @@ -1565,8 +1559,10 @@ class View : public ViewTraits { arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); } + template >> explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + PointerType arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -1582,8 +1578,10 @@ class View : public ViewTraits { "overload taking a layout object instead."); } + template >> explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + PointerType arg_ptr, const typename traits::array_layout& arg_layout) : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} 
//---------------------------------------- From a41df08a786c64b6f39b6a2b2e1df9885d7eabff Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 27 Oct 2023 08:45:58 +0200 Subject: [PATCH 099/432] Bump HPX version used in CI to 1.9.0 --- .github/workflows/continuous-integration-workflow-hpx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 8b39350dc87..187c4e61ec3 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -36,7 +36,7 @@ jobs: uses: actions/checkout@v3 with: repository: STELLAR-GROUP/hpx - ref: 1.8.0 + ref: v1.9.0 path: hpx - uses: actions/cache@v3 id: cache-hpx From 6b4ee34ee9473801dcac18afd9b5722f63f27c07 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 26 Oct 2023 13:51:21 -0400 Subject: [PATCH 100/432] Split files in HIP backend --- .../HIP/Kokkos_HIP_ParallelFor_MDRange.hpp | 173 ++++ core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp | 100 ++ core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp | 177 ++++ ... 
=> Kokkos_HIP_ParallelReduce_MDRange.hpp} | 146 +-- .../HIP/Kokkos_HIP_ParallelReduce_Range.hpp | 329 ++++++ .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 393 ++++++++ ....hpp => Kokkos_HIP_ParallelScan_Range.hpp} | 378 +------ core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 936 ------------------ .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 421 ++++++++ core/src/decl/Kokkos_Declare_HIP.hpp | 10 +- 10 files changed, 1606 insertions(+), 1457 deletions(-) create mode 100644 core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp rename core/src/HIP/{Kokkos_HIP_Parallel_MDRange.hpp => Kokkos_HIP_ParallelReduce_MDRange.hpp} (61%) create mode 100644 core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp rename core/src/HIP/{Kokkos_HIP_Parallel_Range.hpp => Kokkos_HIP_ParallelScan_Range.hpp} (50%) delete mode 100644 core/src/HIP/Kokkos_HIP_Parallel_Team.hpp create mode 100644 core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp new file mode 100644 index 00000000000..db07c360b5c --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ParallelFor +template +class ParallelFor, HIP> { + public: + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + Kokkos::Impl::DeviceIterateTile(m_policy, + m_functor) + .exec_range(); + } + + inline void execute() const { + using ClosureType = ParallelFor; + if (m_policy.m_num_tiles == 0) return; + auto const maxblocks = hip_internal_maximum_grid_count(); + if (Policy::rank == 2) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + 1); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 3) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], + m_policy.m_tile[2]); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + 
m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2], m_policy.m_tile[3]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 + // to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; + // id4,id5 to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4] * m_policy.m_tile[5]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else { + 
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); + } + + } // end execute + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy&, const Functor&) { + using closure_type = + ParallelFor, HIP>; + unsigned block_size = hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp new file mode 100644 index 00000000000..9355c1c75fb --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP + +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(i); + } + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? 
iwork + work_stride + : work_end) { + this->template exec_range(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + using DriverType = ParallelFor; + const int block_size = + Kokkos::Impl::hip_get_preferred_blocksize(); + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp new file mode 100644 index 00000000000..bf0c2193383 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, HIP> { + public: + using Policy = TeamPolicy; + using functor_type = FunctorType; + using size_type = HIP::size_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + + FunctorType const m_functor; + Policy const m_policy; + size_type const m_league_size; + int m_team_size; + size_type const m_vector_size; + int m_shmem_begin; + int m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(member); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(TagType(), member); + } + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + __device__ inline void operator()() const { + // Iterate this block through the league + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team(typename Policy::member_type( + kokkos_impl_hip_shared_memory(), 
m_shmem_begin, m_shmem_size, + static_cast(static_cast(m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size)); + } + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + inline void execute() const { + int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; + dim3 const grid(static_cast(m_league_size), 1, 1); + dim3 const block(static_cast(m_vector_size), + static_cast(m_team_size), 1); + + using closure_type = + ParallelFor, HIP>; + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + } + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value(m_functor, m_team_size)); + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ m_scratch_ptr[0] = nullptr; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + } + + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + } + } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp similarity index 61% rename from core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp rename to core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 0fa325cb12c..55b6218d1c8 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -14,157 +14,19 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP -#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP + +#include #include #include #include #include #include -#include namespace Kokkos { namespace Impl { -// ParallelFor -template -class ParallelFor, HIP> { - public: - using Policy = Kokkos::MDRangePolicy; - using 
functor_type = FunctorType; - - private: - using array_index_type = typename Policy::array_index_type; - using index_type = typename Policy::index_type; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile(m_policy, - m_functor) - .exec_range(); - } - - inline void execute() const { - using ClosureType = ParallelFor; - if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); - if (Policy::rank == 2) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - 1); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], m_policy.m_tile[3]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - 
std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } - - } // end execute - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - - template - static int max_tile_size_product(const Policy&, const Functor&) { - using closure_type = - ParallelFor, HIP>; - unsigned block_size = hip_get_max_blocksize(); - if 
(block_size == 0) - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " - "tile size.")); - return block_size; - } -}; // ParallelReduce template diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp new file mode 100644 index 00000000000..c8981866e8a --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp @@ -0,0 +1,329 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, + Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + public: + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; + using functor_type = FunctorType; + using reducer_type = ReducerType; + using size_type = Kokkos::HIP::size_type; + using index_type = typename Policy::index_type; + // 
Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + + static constexpr bool UseShflReduction = false; + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Make the exec_range calls call to Reduce::DeviceIterateTile + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(i, update); + } + + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), i, update); + } + + public: + __device__ inline void operator()() const { + using ReductionTag = std::conditional_t; + run(ReductionTag{}); + } + + __device__ inline 
void run(SHMEMReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + const integral_nonzero_constant + word_count(reducer.value_size() / sizeof(word_size_type)); + + { + reference_type value = reducer.init(reinterpret_cast( + ::Kokkos::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically + // equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + } + + // Reduce with final value at blockDim.y - 1 location. + // Shortcut for length zero reduction + bool do_final_reduction = m_policy.begin() == m_policy.end(); + if (!do_final_reduction) + do_final_reduction = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + ::Kokkos::kokkos_impl_hip_shared_memory(), + m_scratch_space, m_scratch_flags); + if (do_final_reduction) { + // This is the final block with the final result at the final threads' + // location + + word_size_type* const shared = + ::Kokkos::kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = + m_result_ptr_device_accessible + ? 
reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. + + WorkRange const range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + + pointer_type const result = reinterpret_cast(m_scratch_space); + + int max_active_thread = static_cast(range.end() - range.begin()) < + static_cast(blockDim.y) + ? range.end() - range.begin() + : blockDim.y; + + max_active_thread = + (max_active_thread == 0) ? blockDim.y : max_active_thread; + + value_type init; + reducer.init(&init); + if (m_policy.begin() == m_policy.end()) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } else if (Impl::hip_inter_block_shuffle_reduction<>( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, max_active_thread)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? 
m_result_ptr : result; + *final_result = value; + } + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + return Kokkos::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const index_type nwork = m_policy.end() - m_policy.begin(); + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + !std::is_same::value; + if ((nwork > 0) || need_device_set) { + const int block_size = local_block_size(m_functor_reducer.get_functor()); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } + + // REQUIRED ( 1 , N , 1 ) + dim3 block(1, block_size, 1); + // use a slightly less constrained, but still well bounded limit for + // scratch + int nblocks = (nwork + block.y - 1) / block.y; + // Heuristic deciding the value of nblocks. + // The general idea here is we want to: + // 1. Not undersubscribe the device (i.e., we want at least + // preferred_block_min blocks) + // 2. Have each thread reduce > 1 value to minimize overheads + // 3. 
Limit the total # of blocks, to avoid unbounded scratch space + constexpr int block_max = 4096; + constexpr int preferred_block_min = 1024; + + if (nblocks < preferred_block_min) { + // keep blocks as is, already have low parallelism + } else if (nblocks > block_max) { + // "large dispatch" -> already have lots of parallelism + nblocks = block_max; + } else { + // in the intermediate range, try to have each thread process multiple + // items to offset the cost of the reduction (with not enough + // parallelism to hide it) + int items_per_thread = + (nwork + nblocks * block_size - 1) / (nblocks * block_size); + if (items_per_thread < 4) { + int ratio = std::min( + (nblocks + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + nblocks /= ratio; + } + } + + // TODO: down casting these uses more space than required? + m_scratch_space = + (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * nblocks); + // Intentionally do not downcast to word_size_type since we use HIP + // atomics in Kokkos_HIP_ReduceScan.hpp + m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( + m_policy.space(), sizeof(size_type)); + // Required grid.x <= block.y + dim3 grid(nblocks, 1, 1); + + if (nwork == 0) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem = + UseShflReduction + ? 
0 + : hip_single_inter_block_reduce_scan_shmem( + m_functor_reducer.get_functor(), block.y); + + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible && m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp new file mode 100644 index 00000000000..f2198902e92 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -0,0 +1,393 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, HIP> { + public: + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + public: + using functor_type = FunctorType; + using size_type = HIP::size_type; + + // static int constexpr UseShflReduction = false; + // FIXME_HIP This should be disabled unconditionally for best performance, but + // it currently causes tests to fail. 
+ static constexpr int UseShflReduction = + (ReducerType::static_value_size() != 0); + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(member, update); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), member, update); + } + + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team( + member_type( + kokkos_impl_hip_shared_memory() + m_team_begin, + m_shmem_begin, m_shmem_size, + reinterpret_cast( + reinterpret_cast(m_scratch_ptr[1]) + + static_cast(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + } + + int compute_block_count() const { + constexpr auto light_weight = + 
Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr typename Policy::work_item_property property; + // Numbers were tuned on MI210 using dot product and yAx benchmarks + constexpr int block_max = + (property & light_weight) == light_weight ? 2097152 : 65536; + constexpr int preferred_block_min = 1024; + int block_count = m_league_size; + if (block_count < preferred_block_min) { + // keep blocks as is, already low parallelism + } else if (block_count >= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(size_type)); + + reference_type value = + reducer.init(kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + // Iterate this block through the league + iterate_through_league(threadid, value); + + // Reduce with final value at blockDim.y - 1 location. 
+ bool do_final_reduce = (m_league_size == 0); + if (!do_final_reduce) + do_final_reduce = + hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); + if (do_final_reduce) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + + // Iterate this block through the league + iterate_through_league(threadid, value); + + pointer_type const result = + m_result_ptr_device_accessible + ? 
m_result_ptr + : reinterpret_cast(m_scratch_space); + + value_type init; + reducer.init(&init); + if (m_league_size == 0) { + reducer.final(&value); + *result = value; + } else if (Impl::hip_inter_block_shuffle_reduction( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, blockDim.y)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + *result = value; + } + } + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const bool is_empty_range = m_league_size == 0 || m_team_size == 0; + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + Policy::is_graph_kernel::value || + !std::is_same::value; + if (!is_empty_range || need_device_set) { + int const block_count = compute_block_count(); + + m_scratch_space = hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count); + m_scratch_flags = + hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + if (is_empty_range) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_result_ptr, m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + 
m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + + m_team_begin = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem( + arg_functor_reducer.get_functor(), m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently 
supported for HIP for dynamic " + "sized reduction types."); + + if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 64 is not currently supported with HIP for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " + "L0 scratch memory")); + } + + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp similarity index 50% rename from core/src/HIP/Kokkos_HIP_Parallel_Range.hpp rename to core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index 7356dd2d305..41692a3291b 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -14,390 +14,18 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_RANGE_HPP -#define KOKKOS_HIP_PARALLEL_RANGE_HPP +#ifndef 
KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP #include -#if defined(__HIPCC__) - #include #include #include -#include -#include namespace Kokkos { namespace Impl { -template -class ParallelFor, Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - - private: - using Member = typename Policy::member_type; - using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(i); - } - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(TagType(), i); - } - - public: - using functor_type = FunctorType; - - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - const Member work_stride = blockDim.y * gridDim.x; - const Member work_end = m_policy.end(); - - for (Member iwork = - m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; - iwork < work_end; - iwork = iwork < work_end - work_stride ? 
iwork + work_stride - : work_end) { - this->template exec_range(iwork); - } - } - - inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - - using DriverType = ParallelFor; - const int block_size = - Kokkos::Impl::hip_get_preferred_blocksize(); - const dim3 block(1, block_size, 1); - const dim3 grid( - typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " - "valid execution configuration.")); - } - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, - Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using LaunchBounds = typename Policy::launch_bounds; - - public: - using pointer_type = typename ReducerType::pointer_type; - using value_type = typename ReducerType::value_type; - using reference_type = typename ReducerType::reference_type; - using functor_type = FunctorType; - using reducer_type = ReducerType; - using size_type = Kokkos::HIP::size_type; - using index_type = typename Policy::index_type; - // Conditionally set word_size_type to int16_t or int8_t if value_type is - // smaller than int32_t (Kokkos::HIP::size_type) - // word_size_type is used to determine 
the word count, shared memory buffer - // size, and global memory buffer size before the scan is performed. - // Within the scan, the word count is recomputed based on word_size_type - // and when calculating indexes into the shared/global memory buffers for - // performing the scan, word_size_type is used again. - // For scalars > 4 bytes in size, indexing into shared/global memory relies - // on the block and grid dimensions to ensure that we index at the correct - // offset rather than at every 4 byte word; such that, when the join is - // performed, we have the correct data that was copied over in chunks of 4 - // bytes. - using word_size_type = std::conditional_t< - sizeof(value_type) < sizeof(size_type), - std::conditional_t, size_type>; - - // Algorithmic constraints: blockSize is a power of two AND blockDim.y == - // blockDim.z == 1 - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - word_size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - - static constexpr bool UseShflReduction = false; - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Make the exec_range calls call to Reduce::DeviceIterateTile - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(i, update); - } - - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), i, update); - } - - public: - __device__ inline void operator()() const { - using ReductionTag = std::conditional_t; - run(ReductionTag{}); - } - - __device__ inline void run(SHMEMReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant - 
word_count(reducer.value_size() / sizeof(word_size_type)); - - { - reference_type value = reducer.init(reinterpret_cast( - ::Kokkos::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value)); - - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of - // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically - // equivalent. - - const WorkRange range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - } - - // Reduce with final value at blockDim.y - 1 location. - // Shortcut for length zero reduction - bool do_final_reduction = m_policy.begin() == m_policy.end(); - if (!do_final_reduction) - do_final_reduction = hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - ::Kokkos::kokkos_impl_hip_shared_memory(), - m_scratch_space, m_scratch_flags); - if (do_final_reduction) { - // This is the final block with the final result at the final threads' - // location - - word_size_type* const shared = - ::Kokkos::kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - word_size_type* const global = - m_result_ptr_device_accessible - ? 
reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of work - // to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically equivalent. - - WorkRange const range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - - pointer_type const result = reinterpret_cast(m_scratch_space); - - int max_active_thread = static_cast(range.end() - range.begin()) < - static_cast(blockDim.y) - ? range.end() - range.begin() - : blockDim.y; - - max_active_thread = - (max_active_thread == 0) ? blockDim.y : max_active_thread; - - value_type init; - reducer.init(&init); - if (m_policy.begin() == m_policy.end()) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, max_active_thread)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? 
m_result_ptr : result; - *final_result = value; - } - } - } - - // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - const auto& instance = m_policy.space().impl_internal_space_instance(); - auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem(f, n); - }; - return Kokkos::Impl::hip_get_preferred_blocksize( - instance, shmem_functor); - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - !std::is_same::value; - if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor_reducer.get_functor()); - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " - "valid execution configuration.")); - } - - // REQUIRED ( 1 , N , 1 ) - dim3 block(1, block_size, 1); - // use a slightly less constrained, but still well bounded limit for - // scratch - int nblocks = (nwork + block.y - 1) / block.y; - // Heuristic deciding the value of nblocks. - // The general idea here is we want to: - // 1. Not undersubscribe the device (i.e., we want at least - // preferred_block_min blocks) - // 2. Have each thread reduce > 1 value to minimize overheads - // 3. 
Limit the total # of blocks, to avoid unbounded scratch space - constexpr int block_max = 4096; - constexpr int preferred_block_min = 1024; - - if (nblocks < preferred_block_min) { - // keep blocks as is, already have low parallelism - } else if (nblocks > block_max) { - // "large dispatch" -> already have lots of parallelism - nblocks = block_max; - } else { - // in the intermediate range, try to have each thread process multiple - // items to offset the cost of the reduction (with not enough - // parallelism to hide it) - int items_per_thread = - (nwork + nblocks * block_size - 1) / (nblocks * block_size); - if (items_per_thread < 4) { - int ratio = std::min( - (nblocks + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - nblocks /= ratio; - } - } - - // TODO: down casting these uses more space than required? - m_scratch_space = - (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * nblocks); - // Intentionally do not downcast to word_size_type since we use HIP - // atomics in Kokkos_HIP_ReduceScan.hpp - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // Required grid.x <= block.y - dim3 grid(nblocks, 1, 1); - - if (nwork == 0) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem = - UseShflReduction - ? 
0 - : hip_single_inter_block_reduce_scan_shmem( - m_functor_reducer.get_functor(), block.y); - - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute - - if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_policy.space(), m_result_ptr, - m_scratch_space, size); - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible) {} -}; - template class ParallelScanHIPBase { public: @@ -763,5 +391,3 @@ class ParallelScanWithTotal, } // namespace Kokkos #endif - -#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp deleted file mode 100644 index 17b4597f988..00000000000 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ /dev/null @@ -1,936 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_HIP_PARALLEL_TEAM_HPP -#define KOKKOS_HIP_PARALLEL_TEAM_HPP - -#include - -#if defined(__HIPCC__) - -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - template - friend class TeamPolicyInternal; - - private: - typename traits::execution_space m_space; - int m_league_size; - int m_team_size; - int m_vector_length; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - int m_chunk_size; - bool m_tune_team_size; - bool m_tune_vector_length; - - public: - using execution_space = HIP; - - template - TeamPolicyInternal(TeamPolicyInternal const& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_vector_length = p.m_vector_length; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - m_space = p.m_space; - m_tune_team_size = p.m_tune_team_size; - m_tune_vector_length = p.m_tune_vector_length; - } - - template - int team_size_max(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common(f); - } - - template - inline int team_size_max(const FunctorType& f, - const ParallelReduceTag&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Max, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { - using 
closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - template - int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common( - f); - } - - template - inline int team_size_recommended(FunctorType const& f, - ParallelReduceTag const&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Preferred, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - static int vector_length_max() { return HIPTraits::WarpSize; } - - static int verify_requested_vector_length(int requested_vector_length) { - int test_vector_length = - std::min(requested_vector_length, vector_length_max()); - - // Allow only power-of-two vector_length - if (!(is_integral_power_of_two(test_vector_length))) { - int test_pow2 = 1; - constexpr int warp_size = HIPTraits::WarpSize; - while (test_pow2 < warp_size) { - test_pow2 <<= 1; - if (test_pow2 > test_vector_length) { - break; - } - } - test_vector_length = test_pow2 >> 1; - } - - return test_vector_length; - } - - inline static int scratch_size_max(int level) { - // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team - // reductions. They also use one int64_t in static shared memory for a - // shared ID. 
Furthermore, they use additional scratch memory in some - // reduction scenarios, which depend on the size of the value_type and is - // NOT captured here - constexpr size_t max_possible_team_size = 1024; - constexpr size_t max_reserved_shared_mem_per_team = - (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); - // arbitrarily setting level 1 scratch limit to 20MB, for a - // MI250 that would give us about 4.4GB for 2 teams per CU - constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; - - size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; - return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team - : max_l1_scratch_size); - } - - inline void impl_set_vector_length(size_t size) { m_vector_length = size; } - inline void impl_set_team_size(size_t size) { m_team_size = size; } - int impl_vector_length() const { return m_vector_length; } - - int team_size() const { return m_team_size; } - - int league_size() const { return m_league_size; } - - size_t scratch_size(int level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - size_t team_scratch_size(int level) const { - return m_team_scratch_size[level]; - } - - size_t thread_scratch_size(int level) const { - return m_thread_scratch_size[level]; - } - - typename traits::execution_space space() const { return m_space; } - - TeamPolicyInternal() - : m_space(typename traits::execution_space()), - m_league_size(0), - m_team_size(-1), - m_vector_length(0), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(false), - m_tune_vector_length(false) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(team_size_request), 
- m_vector_length( - (vector_length_request > 0) - ? verify_requested_vector_length(vector_length_request) - : (verify_requested_vector_length(1))), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(bool(team_size_request <= 0)), - m_tune_vector_length(bool(vector_length_request <= 0)) { - // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " - "Team size x vector length must be smaller than 1024.")); - } - } - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} - // FLAG - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - ) - : TeamPolicyInternal(space_, league_size_, team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(space_, league_size_, -1, -1) - - {} - - TeamPolicyInternal(int league_size_, int team_size_request, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, vector_length_request) {} - - 
TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - vector_length_request) {} - - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(int league_size_, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - -1) {} - - int chunk_size() const { return m_chunk_size; } - - TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerTeamValue const& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerThreadValue const& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, - PerThreadValue const& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - using member_type = Kokkos::Impl::HIPTeamMember; - - protected: - template - int internal_team_size_common(FunctorType const& f) const { 
- const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); - unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); - using Tag = typename PatternTagFromImplSpecialization::type; - if constexpr (std::is_same_v) { - using Interface = - typename Impl::DeduceFunctorPatternInterface::type; - using Analysis = - Impl::FunctorAnalysis; - shmem_thread += - ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); - } - const int vector_length = impl_vector_length(); - - const auto functor = [&f, shmem_block, shmem_thread, vector_length]( - const hipFuncAttributes& attr, int block_size) { - int functor_shmem = - ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - return shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - }; - int block_size; - if constexpr (BlockSize == BlockType::Max) { - block_size = hip_get_max_team_blocksize( - space().impl_internal_space_instance(), functor); - } else { - block_size = - hip_get_preferred_team_blocksize( - space().impl_internal_space_instance(), functor); - } - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " - "team size.")); - } - if constexpr (std::is_same_v) { - return block_size / impl_vector_length(); - } else { - // Currently we require Power-of-2 team size for reductions. 
- int p2 = 1; - while (p2 <= block_size) p2 *= 2; - p2 /= 2; - return p2 / impl_vector_length(); - } - } -}; - -__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks, - size_t num_scratch_locks) { - int64_t threadid = 0; - __shared__ int64_t base_thread_id; - if (threadIdx.x == 0 && threadIdx.y == 0) { - int64_t const wraparound_len = - Kokkos::min(int64_t(league_size), - int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (int64_t(threadid + blockDim.x * blockDim.y) >= - wraparound_len * blockDim.x * blockDim.y) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - return threadid; -} - -__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, - int64_t threadid) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - scratch_locks[threadid] = 0; - } -} - -template -class ParallelFor, HIP> { - public: - using Policy = TeamPolicy; - using functor_type = FunctorType; - using size_type = HIP::size_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ team reduce space ] - // [ team shared space ] - - FunctorType const m_functor; - Policy const m_policy; - size_type const m_league_size; - int m_team_size; - size_type const m_vector_size; - int m_shmem_begin; - int m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - 
template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(member); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(TagType(), member); - } - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - __device__ inline void operator()() const { - // Iterate this block through the league - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team(typename Policy::member_type( - kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, - static_cast(static_cast(m_scratch_ptr[1]) + - ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size)); - } - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - inline void execute() const { - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; - dim3 const grid(static_cast(m_league_size), 1, 1); - dim3 const block(static_cast(m_vector_size), - static_cast(m_team_size), 1); - - using closure_type = - ParallelFor, HIP>; - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - } - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = 
m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); - - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - m_scratch_ptr[0] = nullptr; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); - } - - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); - } - } - - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, HIP> { - public: - using Policy = 
TeamPolicyInternal; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - public: - using functor_type = FunctorType; - using size_type = HIP::size_type; - - // static int constexpr UseShflReduction = false; - // FIXME_HIP This should be disabled unconditionally for best performance, but - // it currently causes tests to fail. - static constexpr int UseShflReduction = - (ReducerType::static_value_size() != 0); - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ global reduce space ] - // [ team reduce space ] - // [ team shared space ] - // - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type m_team_begin; - size_type m_shmem_begin; - size_type m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - const size_type m_league_size; - int m_team_size; - const size_type m_vector_size; - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(member, update); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& 
member, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), member, update); - } - - __device__ inline void iterate_through_league(int const threadid, - reference_type value) const { - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - kokkos_impl_hip_shared_memory() + m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } - } - - int compute_block_count() const { - constexpr auto light_weight = - Kokkos::Experimental::WorkItemProperty::HintLightWeight; - constexpr typename Policy::work_item_property property; - // Numbers were tuned on MI210 using dot product and yAx benchmarks - constexpr int block_max = - (property & light_weight) == light_weight ? 
2097152 : 65536; - constexpr int preferred_block_min = 1024; - int block_count = m_league_size; - if (block_count < preferred_block_min) { - // keep blocks as is, already low parallelism - } else if (block_count >= block_max) { - block_count = block_max; - - } else { - int nwork = m_league_size * m_team_size; - int items_per_thread = - (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); - if (items_per_thread < 4) { - int ratio = std::min( - (block_count + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - block_count /= ratio; - } - } - - return block_count; - } - - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - using ReductionTag = std::conditional_t; - run(ReductionTag{}, threadid); - - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - __device__ inline void run(SHMEMReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - integral_nonzero_constant const - word_count(reducer.value_size() / sizeof(size_type)); - - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - // Iterate this block through the league - iterate_through_league(threadid, value); - - // Reduce with final value at blockDim.y - 1 location. 
- bool do_final_reduce = (m_league_size == 0); - if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); - if (do_final_reduce) { - // This is the final block with the final result at the final threads' - // location - - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - - // Iterate this block through the league - iterate_through_league(threadid, value); - - pointer_type const result = - m_result_ptr_device_accessible - ? 
m_result_ptr - : reinterpret_cast(m_scratch_space); - - value_type init; - reducer.init(&init); - if (m_league_size == 0) { - reducer.final(&value); - *result = value; - } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, blockDim.y)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - *result = value; - } - } - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - Policy::is_graph_kernel::value || - !std::is_same::value; - if (!is_empty_range || need_device_set) { - int const block_count = compute_block_count(); - - m_scratch_space = hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * block_count); - m_scratch_flags = - hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - - dim3 block(m_vector_size, m_team_size, 1); - dim3 grid(block_count, 1, 1); - if (is_empty_range) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); - - if (m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_result_ptr, m_scratch_space, size); - } - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - 
m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - - m_team_begin = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem( - arg_functor_reducer.get_functor(), m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value( - arg_functor_reducer.get_functor(), m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently 
supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); - } - - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif - -#endif diff --git a/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp new file mode 100644 index 00000000000..984fa9d3d2d --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -0,0 +1,421 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP +#define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class TeamPolicyInternal + : public PolicyTraits { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits; + + template + friend class TeamPolicyInternal; + + private: + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = HIP; + + template + TeamPolicyInternal(TeamPolicyInternal const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common(f); + } + + template + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + 
BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + template + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common( + f); + } + + template + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag const&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + static int vector_length_max() { return HIPTraits::WarpSize; } + + static int verify_requested_vector_length(int requested_vector_length) { + int test_vector_length = + std::min(requested_vector_length, vector_length_max()); + + // Allow only power-of-two vector_length + if (!(is_integral_power_of_two(test_vector_length))) { + int test_pow2 = 1; + constexpr int warp_size = HIPTraits::WarpSize; + while (test_pow2 < warp_size) { + test_pow2 <<= 1; + if (test_pow2 > test_vector_length) { + break; + } + } + test_vector_length = test_pow2 >> 1; + } + + return test_vector_length; + } + + inline static int scratch_size_max(int level) { + // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team + 
// reductions. They also use one int64_t in static shared memory for a + // shared ID. Furthermore, they use additional scratch memory in some + // reduction scenarios, which depend on the size of the value_type and is + // NOT captured here + constexpr size_t max_possible_team_size = 1024; + constexpr size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); + // arbitrarily setting level 1 scratch limit to 20MB, for a + // MI250 that would give us about 4.4GB for 2 teams per CU + constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; + + size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; + return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); + } + + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + + int team_size() const { return m_team_size; } + + int league_size() const { return m_league_size; } + + size_t scratch_size(int level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + + size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } + + typename traits::execution_space space() const { return m_space; } + + TeamPolicyInternal() + : m_space(typename traits::execution_space()), + m_league_size(0), + m_team_size(-1), + m_vector_length(0), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, int vector_length_request = 1) 
+ : m_space(space_), + m_league_size(league_size_), + m_team_size(team_size_request), + m_vector_length( + (vector_length_request > 0) + ? verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible + if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + Impl::throw_runtime_exception( + "Requested too large league_size for TeamPolicy on HIP execution " + "space."); + + // Make sure total block size is permissible + if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { + Impl::throw_runtime_exception( + std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " + "Team size x vector length must be smaller than 1024.")); + } + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} + + TeamPolicyInternal(int league_size_, int team_size_request, + int vector_length_request = 1) + : TeamPolicyInternal(typename 
traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} + + int chunk_size() const { return m_chunk_size; } + + TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerTeamValue const& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerThreadValue const& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, + PerThreadValue const& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + using member_type = 
Kokkos::Impl::HIPTeamMember; + + protected: + template + int internal_team_size_common(FunctorType const& f) const { + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + using Tag = typename PatternTagFromImplSpecialization::type; + if constexpr (std::is_same_v) { + using Interface = + typename Impl::DeduceFunctorPatternInterface::type; + using Analysis = + Impl::FunctorAnalysis; + shmem_thread += + ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); + } + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + if constexpr (BlockSize == BlockType::Max) { + block_size = hip_get_max_team_blocksize( + space().impl_internal_space_instance(), functor); + } else { + block_size = + hip_get_preferred_team_blocksize( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " + "team size.")); + } + if constexpr (std::is_same_v) { + return block_size / impl_vector_length(); + } else { + // Currently we require Power-of-2 team size for reductions. 
+ int p2 = 1; + while (p2 <= block_size) p2 *= 2; + p2 /= 2; + return p2 / impl_vector_length(); + } + } +}; + +__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, + int32_t* scratch_locks, + size_t num_scratch_locks) { + int64_t threadid = 0; + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + int64_t const wraparound_len = + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + wraparound_len * blockDim.x * blockDim.y) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + return threadid; +} + +__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, + int64_t threadid) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + scratch_locks[threadid] = 0; + } +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/decl/Kokkos_Declare_HIP.hpp b/core/src/decl/Kokkos_Declare_HIP.hpp index e115f7051f3..cf405e57b8f 100644 --- a/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/core/src/decl/Kokkos_Declare_HIP.hpp @@ -25,9 +25,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include From 400dd1d999d4fd669f7af314e82072446b4f7aca Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 30 Oct 2023 11:06:30 -0400 Subject: [PATCH 101/432] Trim some fat in `CudaInternal` (towards multiple GPUs support) (#6544) * Remove unused Impl::cuda_internal_maximum_concurrent_block_count function * Drop unused CudaInternal::m_shmemPerSM * Drop unused CudaInternal::m_maxBlocksPerSM * Drop 
unused CudaInternal::m_maxThreadsPerBlock * Drop (unused) Impl::cuda_internal_maximum_warp_count() * Drop CudaInternal::m_maxWarpCount data member * Drop CudaInternal::m_multiProcCount data member * Drop CudaInternal::m_maxThreadsPerSM data member * Drop CudaInternal::m_maxBlock data member * Get rid of Impl::cuda_internal_maximum_grid_count() * Drop CudaInternal::m_maxShmemPerBlock static data member * Drop Impl::cuda_internal_multiprocessor_count() * Drop (unused) CudaTraits::warp_{count,align} static member functions * Fixup exception msg * Fixup tasking get exec space from teh scheduler * Try to get to the device properties some othe eay in cuda tasking --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 65 ++----------------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 27 +------- core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 8 +-- core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp | 4 +- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 8 +-- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 24 +++---- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 14 ++-- core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 3 +- core/src/Cuda/Kokkos_Cuda_Task.hpp | 14 ++-- core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp | 4 +- 10 files changed, 53 insertions(+), 118 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index d7f853d9910..c36650263e3 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -383,8 +383,13 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { // Allocate some initial space. This will grow as needed. { - const unsigned reduce_block_count = - m_maxWarpCount * Impl::CudaTraits::WarpSize; + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. 
+ auto const maxWarpCount = std::min( + m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize, + CudaTraits::WarpSize); + unsigned const reduce_block_count = + maxWarpCount * Impl::CudaTraits::WarpSize; (void)scratch_unified(16 * sizeof(size_type)); (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); @@ -624,30 +629,6 @@ void CudaInternal::finalize() { //---------------------------------------------------------------------------- -Cuda::size_type cuda_internal_multiprocessor_count() { - return CudaInternal::singleton().m_multiProcCount; -} - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() { -#if defined(KOKKOS_ARCH_KEPLER) - // Compute capability 3.0 through 3.7 - enum : int { max_resident_blocks_per_multiprocessor = 16 }; -#else - // Compute capability 5.0 through 6.2 - enum : int { max_resident_blocks_per_multiprocessor = 32 }; -#endif - return CudaInternal::singleton().m_multiProcCount * - max_resident_blocks_per_multiprocessor; -}; - -Cuda::size_type cuda_internal_maximum_warp_count() { - return CudaInternal::singleton().m_maxWarpCount; -} - -std::array cuda_internal_maximum_grid_count() { - return CudaInternal::singleton().m_maxBlock; -} - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -761,38 +742,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default } #endif - //---------------------------------- - // number of multiprocessors - Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. 
- Impl::CudaInternal::m_maxWarpCount = - cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; - - if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) { - Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize; - } - - //---------------------------------- - // Maximum number of blocks: - - Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0]; - Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1]; - Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2]; - - Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; - Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; - Impl::CudaInternal::m_maxBlocksPerSM = - Impl::CudaInternal::m_cudaArch < 500 - ? 16 - : (Impl::CudaInternal::m_cudaArch < 750 - ? 32 - : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32)); - Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; - Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; - //---------------------------------- cudaStream_t singleton_stream; diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index a324adecfeb..af45a787303 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -55,27 +55,10 @@ struct CudaTraits { unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( - CudaSpace::size_type i) { - return (i + WarpIndexMask) >> WarpIndexShift; - } - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( - CudaSpace::size_type i) { - constexpr CudaSpace::size_type Mask = ~WarpIndexMask; - return (i + WarpIndexMask) & Mask; - } }; //---------------------------------------------------------------------------- -CudaSpace::size_type cuda_internal_multiprocessor_count(); -CudaSpace::size_type 
cuda_internal_maximum_warp_count(); -std::array cuda_internal_maximum_grid_count(); - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); - CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&, const std::size_t size); CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&, @@ -104,15 +87,7 @@ class CudaInternal { inline static int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array m_maxBlock = {0, 0, 0}; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxBlocksPerSM = 0; - inline static int m_maxThreadsPerSM = 0; - inline static int m_maxThreadsPerBlock = 0; + inline static int m_cudaArch = -1; static int concurrency(); inline static cudaDeviceProp m_deviceProp; diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 82a72b69021..b68eec13a01 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -21,7 +21,6 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include #include #include #include @@ -118,10 +117,11 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { } inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { + int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( - std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" - " memory request is too large")); + "CudaParallelLaunch (or graph node creation) FAILED: shared memory " + "request is too large"); } } diff --git a/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 7492ab49e56..2c7eba7a18f 100644 --- 
a/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -40,8 +40,8 @@ template <> inline TileSizeProperties get_tile_size_properties( const Kokkos::Cuda& space) { TileSizeProperties properties; - properties.max_threads = - space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.max_threads = space.impl_internal_space_instance() + ->m_deviceProp.maxThreadsPerMultiProcessor; properties.default_largest_tile_size = 16; properties.default_tile_size = 2; properties.max_total_tile_size = 512; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 8aae27d091f..13feed64e3e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -96,7 +96,7 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = cuda_internal_maximum_grid_count(); + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); KOKKOS_ASSERT(block.x > 0); @@ -320,6 +320,8 @@ class ParallelReduce( f, n); @@ -330,9 +332,7 @@ class ParallelReduce::get_cuda_func_attributes(); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 5226c48bd9a..fcbd75c57f9 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -94,10 +94,10 @@ class ParallelFor, Kokkos::Cuda> { 0, 0); KOKKOS_ASSERT(block_size > 0); dim3 block(1, block_size, 1); + const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0]; dim3 grid( - std::min( - typename Policy::index_type((nwork + block.y - 
1) / block.y), - typename Policy::index_type(cuda_internal_maximum_grid_count()[0])), + std::min(typename Policy::index_type((nwork + block.y - 1) / block.y), + typename Policy::index_type(maxGridSizeX)), 1, 1); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { @@ -254,6 +254,8 @@ class ParallelReduce, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); @@ -264,9 +266,7 @@ class ParallelReduce, CudaParallelLaunch::get_cuda_func_attributes(); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( @@ -609,11 +609,11 @@ class ParallelScan, Kokkos::Cuda> { // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; @@ -933,11 +933,11 @@ class ParallelScanWithTotal, // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 
498e57f94a7..6724c91fcbc 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -262,7 +262,8 @@ class TeamPolicyInternal m_tune_team(bool(team_size_request <= 0)), m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0])) + const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0]; + if (league_size_ >= maxGridSizeX) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution " "space."); @@ -575,10 +576,11 @@ class ParallelFor, static_cast(m_league_size)))); } + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock, - shmem_size_total); + if (maxShmemPerBlock < shmem_size_total) { + printf("%i %i\n", maxShmemPerBlock, shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } @@ -935,6 +937,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (maxShmemPerBlock < shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " "L0 scratch memory")); diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 7ccedbfe28d..3c0ade365ab 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy, unsigned reqShmemSize = cuda_single_inter_block_reduce_scan_shmem( functor, minBlockSize); - size_t maxShmemPerBlock = - 
policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; + size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock; if (reqShmemSize > maxShmemPerBlock) { Kokkos::Impl::throw_runtime_exception( diff --git a/core/src/Cuda/Kokkos_Cuda_Task.hpp b/core/src/Cuda/Kokkos_Cuda_Task.hpp index baff7ef3f55..19179ce5c0c 100644 --- a/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -84,8 +84,8 @@ class TaskQueueSpecialization> { KOKKOS_INLINE_FUNCTION static void iff_single_thread_recursive_execute(scheduler_type const&) {} - static int get_max_team_count(execution_space const&) { - return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + static int get_max_team_count(execution_space const& space) { + return space.cuda_device_prop().multiProcessorCount * warps_per_block; } __device__ static void driver(scheduler_type scheduler, @@ -225,7 +225,9 @@ class TaskQueueSpecialization> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + scheduler.get_execution_space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; const cudaStream_t stream = nullptr; @@ -466,7 +468,11 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + // FIXME not sure why this didn't work + // scheduler.get_execution_space().cuda_device_prop().multiProcessorCount; + CudaInternal::singleton().m_deviceProp.multiProcessorCount; + const dim3 
grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 ); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; diff --git a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index a945a716bc3..c7ea6988a5d 100644 --- a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,7 +77,9 @@ class ParallelFor, inline void execute() { const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + m_policy.space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared = 0; From 3093a0e64bb30191cbafc705a70657929c74cfca Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 30 Oct 2023 14:42:50 -0400 Subject: [PATCH 102/432] Only define STDALGO_TEAM_SOURCES_* once --- algorithms/unit_tests/CMakeLists.txt | 291 +++++++++++++-------------- 1 file changed, 145 insertions(+), 146 deletions(-) diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index 419f5ec1d13..7d5d0c67652 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -57,35 +57,37 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() + endif() +endforeach() - # ------------------------------------------ - # std set A - # ------------------------------------------ - set(STDALGO_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std set A +# ------------------------------------------ +set(STDALGO_SOURCES_A) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator - ) - list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) 
- endforeach() + ) + list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set B - # ------------------------------------------ - set(STDALGO_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std set B +# ------------------------------------------ +set(STDALGO_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps - ) - list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set C - # ------------------------------------------ - set(STDALGO_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std set C +# ------------------------------------------ +set(STDALGO_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsLexicographicalCompare StdAlgorithmsForEach @@ -100,15 +102,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsSearch_n StdAlgorithmsMismatch StdAlgorithmsMoveBackward - ) - list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set D - # ------------------------------------------ - set(STDALGO_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std set D +# ------------------------------------------ +set(STDALGO_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsModOps StdAlgorithmsModSeqOps @@ -128,15 +130,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsReverse StdAlgorithmsShiftLeft StdAlgorithmsShiftRight - ) - list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set E - # ------------------------------------------ - set(STDALGO_SOURCES_E) - 
foreach(Name +# ------------------------------------------ +# std set E +# ------------------------------------------ +set(STDALGO_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsIsSorted StdAlgorithmsIsSortedUntil @@ -149,83 +151,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTransformUnaryOp StdAlgorithmsTransformExclusiveScan StdAlgorithmsTransformInclusiveScan - ) - list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team Q - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_Q) - foreach(Name +# ------------------------------------------ +# std team Q +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_Q) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team P - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_P) - foreach(Name +# ------------------------------------------ +# std team P +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_P) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team M - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_M) - foreach(Name +# ------------------------------------------ +# std team M +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_M) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp 
StdAlgorithmsTeamTransformBinaryOp StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges - ) - list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team L - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_L) - foreach(Name +# ------------------------------------------ +# std team L +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_L) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint - ) - list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team I - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_I) - foreach(Name +# ------------------------------------------ +# std team I +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_I) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce StdAlgorithmsTeamTransformReduce - ) - list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team H - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_H) - foreach(Name +# ------------------------------------------ +# std team H +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_H) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamCopy StdAlgorithmsTeamCopy_n @@ -236,43 +238,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamRemoveIf StdAlgorithmsTeamRemoveCopy 
StdAlgorithmsTeamRemoveCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team G - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_G) - foreach(Name +# ------------------------------------------ +# std team G +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_G) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft StdAlgorithmsTeamShiftRight - ) - list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team F - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_F) - foreach(Name +# ------------------------------------------ +# std team F +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_F) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate StdAlgorithmsTeamRotateCopy - ) - list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team E - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std team E +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFill StdAlgorithmsTeamFill_n @@ -280,28 +282,28 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamReplaceIf StdAlgorithmsTeamReplaceCopy StdAlgorithmsTeamReplaceCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) 
+endforeach() - # ------------------------------------------ - # std team D - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std team D +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement - ) - list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team C - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std team C +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFind StdAlgorithmsTeamFindIf @@ -310,29 +312,29 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamAnyOf StdAlgorithmsTeamNoneOf StdAlgorithmsTeamSearchN - ) - list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team B - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std team B +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd StdAlgorithmsTeamFindFirstOf - ) - list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team A - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_A) - foreach(Name +# 
------------------------------------------ +# std team A +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_A) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamAdjacentFind StdAlgorithmsTeamCount @@ -341,11 +343,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamForEachN StdAlgorithmsTeamLexicographicalCompare StdAlgorithmsTeamMismatch - ) - list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) - endforeach() - - endif() + ) + list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. From 6d95b621e110df87c488cb93a2f647d1a592af35 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 30 Oct 2023 17:21:05 -0400 Subject: [PATCH 103/432] Remove logical memory spaces --- core/src/Cuda/Kokkos_CudaSpace.hpp | 6 - core/src/HIP/Kokkos_HIP_Space.hpp | 6 - core/src/Kokkos_Core.hpp | 1 - core/src/Kokkos_Core_fwd.hpp | 6 - core/src/Kokkos_HBWSpace.hpp | 3 - core/src/Kokkos_HostSpace.hpp | 3 - core/src/Kokkos_LogicalSpaces.hpp | 413 --------------------- core/src/SYCL/Kokkos_SYCL_Space.hpp | 15 - core/unit_test/CMakeLists.txt | 8 +- core/unit_test/tools/TestLogicalSpaces.hpp | 177 --------- 10 files changed, 1 insertion(+), 637 deletions(-) delete mode 100644 core/src/Kokkos_LogicalSpaces.hpp delete mode 100644 core/unit_test/tools/TestLogicalSpaces.hpp diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index b8fa335cd3b..ec532d23103 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -90,8 +90,6 @@ class CudaSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -167,8 +165,6 @@ class CudaUVMSpace { const size_t arg_logical_size = 0) const; 
private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -241,8 +237,6 @@ class CudaHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5..a10d515f04f 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -76,8 +76,6 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -140,8 +138,6 @@ class HIPHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -209,8 +205,6 @@ class HIPManagedSpace { private: int m_device; ///< Which HIP device - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index 805411a699e..cde77dc3e90 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -46,7 +46,6 @@ #include #include -#include #include #include #include diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 0c64352464a..7edb35f00eb 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -255,12 
+255,6 @@ KOKKOS_FUNCTION void runtime_check_memory_access_violation( } } // namespace Impl - -namespace Experimental { -template -class LogicalMemorySpace; -} - } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp index 369b7bafb7b..f52362c12ab 100644 --- a/core/src/Kokkos_HBWSpace.hpp +++ b/core/src/Kokkos_HBWSpace.hpp @@ -83,9 +83,6 @@ class HBWSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 90d14040637..2cbc8e88f18 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -98,9 +98,6 @@ class HostSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = diff --git a/core/src/Kokkos_LogicalSpaces.hpp b/core/src/Kokkos_LogicalSpaces.hpp deleted file mode 100644 index 1ee1d2c81fe..00000000000 --- a/core/src/Kokkos_LogicalSpaces.hpp +++ /dev/null @@ -1,413 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_LOGICALSPACES_HPP -#define KOKKOS_LOGICALSPACES_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -namespace Kokkos { -namespace Experimental { -struct DefaultMemorySpaceNamer { - static constexpr const char* get_name() { - return "DefaultLogicalMemorySpaceName"; - } -}; - -struct LogicalSpaceSharesAccess { - struct shared_access {}; - struct no_shared_access {}; -}; - -/// \class LogicalMemorySpace -/// \brief -/// -/// LogicalMemorySpace is a space that is identical to another space, -/// but differentiable by name and template argument -template -class LogicalMemorySpace { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - // [DZP] For some reason I don't yet know, using LogicalMemorySpaces - // inside an OpenMPTarget build causes errors in the - // SharedAllocationRecords of other types. This is my way of erroring - // a build if we instantiate a LogicalMemSpace in an OMPTarget build - static_assert(!std::is_same::value, - "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " - "debugging memory issues"); -#endif - public: - //! Tag this class as a kokkos memory space - using memory_space = LogicalMemorySpace; - using size_type = typename BaseSpace::size_type; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - - using execution_space = - std::conditional_t::value, - typename BaseSpace::execution_space, - DefaultBaseExecutionSpace>; - - using device_type = Kokkos::Device; - - LogicalMemorySpace() = default; - - template - LogicalMemorySpace(Args&&... 
args) : underlying_space((Args &&) args...) {} - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); - } - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); - } - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); - } - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); - } - - /**\brief Return Name of the MemorySpace */ - constexpr static const char* name() { return Namer::get_name(); } - - private: - BaseSpace underlying_space; - template - friend class LogicalMemorySpace; - friend class Kokkos::Impl::SharedAllocationRecord; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - return underlying_space.impl_allocate(arg_label, arg_alloc_size, - arg_logical_size, arg_handle); - } - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, - arg_logical_size, arg_handle); - } -}; -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct MemorySpaceAccess< - 
Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - OtherSpace> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - OtherSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { -template -class SharedAllocationRecord, - void> : public SharedAllocationRecord { - private: - using SpaceType = - Kokkos::Experimental::LogicalMemorySpace; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase* arg_rec) { - delete static_cast(arg_rec); - } - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this - * LogicalMemorySpace instance */ - static RecordBase s_root_record; -#endif - - const SpaceType m_space; - - protected: - 
~SharedAllocationRecord() { - m_space.deallocate(RecordBase::m_alloc_ptr->m_label, - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); - } - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const SpaceType& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast*>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; - } - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const 
SpaceType& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void*)nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); - } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked: fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); - } - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } - } - - static SharedAllocationRecord* get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = SharedAllocationRecord; - - SharedAllocationHeader const* const head = - alloc_ptr ? Header::get_header(alloc_ptr) - : (SharedAllocationHeader*)nullptr; - RecordHost* const record = - head ? 
static_cast(head->m_record) : (RecordHost*)nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " - "void >::get_record ERROR")); - } - - return record; - } -#ifdef KOKKOS_ENABLE_DEBUG - static void print_records(std::ostream& s, const SpaceType&, - bool detail = false) { - SharedAllocationRecord::print_host_accessible_records( - s, "HostSpace", &s_root_record, detail); - } -#else - static void print_records(std::ostream&, const SpaceType&, - bool detail = false) { - (void)detail; - throw_runtime_exception( - "SharedAllocationRecord::print_records only works " - "with KOKKOS_ENABLE_DEBUG enabled"); - } -#endif -}; -#ifdef KOKKOS_ENABLE_DEBUG -/**\brief Root record for tracked allocations from this LogicalSpace - * instance */ -template -SharedAllocationRecord - SharedAllocationRecord, - void>::s_root_record; -#endif - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct DeepCopy, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - DestinationSpace, ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; -} // namespace Impl - -} // namespace 
Kokkos -#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b..88d644205d1 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -66,11 +66,6 @@ class SYCLDeviceUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLDeviceUSM"; }; private: @@ -102,11 +97,6 @@ class SYCLSharedUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLSharedUSM"; }; private: @@ -138,11 +128,6 @@ class SYCLHostUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLHostUSM"; }; private: diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 413c699be04..eb2a1d014a2 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -1038,13 +1038,7 @@ KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate tools/TestCategoricalTuner.cpp ) endif() - if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LogicalSpaces - SOURCES - tools/TestLogicalSpaces.cpp - ) - endif() + SET(KOKKOSP_SOURCES UnitTestMainInit.cpp tools/TestEventCorrectness.cpp diff --git a/core/unit_test/tools/TestLogicalSpaces.hpp b/core/unit_test/tools/TestLogicalSpaces.hpp deleted file mode 100644 index 4e56f8996a0..00000000000 --- a/core/unit_test/tools/TestLogicalSpaces.hpp +++ /dev/null @@ -1,177 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#include -#include -#include "Kokkos_Core.hpp" - -#include - -namespace Test { - -void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} -void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} - -void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, - const void*, const uint64_t) { - ASSERT_TRUE(false) << "Unexpected memory event"; -} - -void expect_no_events() { - Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); - Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); -} - -std::string expected_view_name; -std::string expected_space_name; -std::string error_message; -void expect_allocation_event(const std::string evn, const std::string esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_allocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} -void expect_deallocation_event(const 
std::string& evn, const std::string& esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_deallocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} - -struct TestSpaceNamer { - static constexpr const char* get_name() { return "TestSpace"; } -}; -struct TestSpaceNamerTwo { - static constexpr const char* get_name() { return "YoDawg"; } -}; -struct TestSpaceNamerThree { - static constexpr const char* get_name() { return "CustomAccessSpace"; } -}; -using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - -void test_view_construct() { - { - expect_allocation_event("puppy_view", "TestSpace", "View allocation"); - Kokkos::View pup_view("puppy_view", 1000); - expect_deallocation_event("puppy_view", "TestSpace", "View free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_malloc_free() { - expect_allocation_event("does_malloc_work", "TestSpace", - "Error in malloc event"); - auto* temp = - Kokkos::kokkos_malloc("does_malloc_work", 1000); - expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); - Kokkos::kokkos_free(temp); - Kokkos::Tools::Experimental::pause_tools(); -} -void test_chained_spaces() { - using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - { - expect_allocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space view allocation"); - 
Kokkos::View pup_view("xzibit_dot_jpeg", - 1000); - expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_space_allocations() { - fake_memory_space debug_space; - expect_allocation_event("allocation_from_space", "TestSpace", - "Space allocation"); - auto* temp = debug_space.allocate("allocation_from_space", 1000); - expect_deallocation_event("allocation_from_space", "TestSpace", - "Space deallocation"); - debug_space.deallocate("allocation_from_space", temp, 1000); - Kokkos::Tools::Experimental::pause_tools(); -} -template -struct AccessCheckKernel { - Kokkos::View data; - KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } -}; - -template -void test_allowed_access() { - constexpr const int data_size = 1000; - // We use an unmananged View here since we want to detect a memory access - // violation in the parallel_for and not in the initialization of the View. - std::vector test_data(data_size); - Kokkos::View test_view(test_data.data(), data_size); - AccessCheckKernel functor{test_view}; - Kokkos::parallel_for( - "access_allowed", - Kokkos::RangePolicy(0, data_size), - functor); - Kokkos::fence(); -} - -using semantically_independent_logical_space = - Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, - TestSpaceNamerThree, - Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; - -TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } -TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } -TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } -TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } -TEST(defaultdevicetype, access_allowed) { - test_allowed_access(); -} -// FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) -TEST(defaultdevicetype_DeathTest, access_forbidden) { - 
::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { test_allowed_access(); }, - "Kokkos::View ERROR: attempt to access inaccessible memory space"); -} -#endif - -} // namespace Test From 6eb12dbc99c5930512454ca89dd44ab22710b290 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Nov 2023 07:57:46 -0400 Subject: [PATCH 104/432] Rollback changes to view constructors to reduce the number of instantiations (#6564) * Add test constructing empty runtime unmanaged view with 0 and NULL * Revert "Merge pull request #6536 from dalg24/view_constructor_from_label" This reverts commit 201d1dead123475d3521512b7eb891115c6ed72d, reversing changes made to 13efa71ac040ec431131ba250045265b38c65118. * Test View constructor from nullptr for const data type * Avoid clang-tidy warnings --------- Co-authored-by: Christian Trott --- core/src/Kokkos_View.hpp | 36 ++++++------ core/unit_test/CMakeLists.txt | 1 + .../TestViewEmptyRuntimeUnmanaged.hpp | 55 +++++++++++++++++++ 3 files changed, 75 insertions(+), 17 deletions(-) create mode 100644 core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index c953680dc78..bcbb28014cd 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1489,20 +1489,26 @@ class View : public ViewTraits { } // Allocate with label and layout - explicit inline View(std::string const& arg_label, - typename traits::array_layout const& arg_layout) + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, + typename traits::array_layout> const& arg_layout) : View(Impl::ViewCtorProp(arg_label), arg_layout) {} // Allocate label and layout, must disambiguate from subview constructor. 
- explicit inline View(std::string const& arg_label, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) : View(Impl::ViewCtorProp(arg_label), typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) { @@ -1559,10 +1565,8 @@ class View : public ViewTraits { arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); } - template >> explicit KOKKOS_INLINE_FUNCTION View( - PointerType arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -1578,10 +1582,8 @@ class View : public ViewTraits { "overload taking a layout object instead."); } - template >> explicit KOKKOS_INLINE_FUNCTION View( - PointerType arg_ptr, const typename traits::array_layout& arg_layout) + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} 
//---------------------------------------- diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 413c699be04..e280f4c7ef1 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -233,6 +233,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewCopy_a ViewCopy_b ViewCtorDimMatch + ViewEmptyRuntimeUnmanaged ViewHooks ViewLayoutStrideAssignment ViewMapping_a diff --git a/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp b/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp new file mode 100644 index 00000000000..b156b72860e --- /dev/null +++ b/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void test_empty_view_runtime_unmanaged() { + T d{}; + auto* p = reinterpret_cast(0xABADBABE); + + (void)Kokkos::View(p); + (void)Kokkos::View(&d); + (void)Kokkos::View(nullptr); + (void)Kokkos::View(NULL); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0); + (void)Kokkos::View(&d, 0); + (void)Kokkos::View(nullptr, 0); + (void)Kokkos::View(NULL, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0, 0); + (void)Kokkos::View(&d, 0, 0); + (void)Kokkos::View(nullptr, 0, 0); + (void)Kokkos::View(NULL, 0, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0, 0); // NOLINT(modernize-use-nullptr) +} + +TEST(TEST_CATEGORY, view_empty_runtime_unmanaged) { + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); +} + +} // namespace From a07c7a2b6c9999020859a7de86004e447790d0c4 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 Nov 2023 08:15:06 -0400 Subject: [PATCH 105/432] Address reviewer comments --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 09dd8875539..78e82df1005 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -97,11 +97,11 @@ __global__ void query_cuda_kernel_arch(int *d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch(int cuda_device) { +int cuda_kernel_arch(int device_id) { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); KOKKOS_IMPL_CUDA_SAFE_CALL( @@ -390,8 +390,6 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { "Currently, the device id must match the device id used when Kokkos " "was initialized!"); - was_initialized = true; - //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. @@ -688,8 +686,12 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; + Impl::CudaInternal::m_cudaDev = cuda_device_id; Impl::CudaInternal::m_deviceProp = cudaProp; + Kokkos::Impl::cuda_device_synchronize( + "Kokkos::CudaInternal::initialize: Fence on space initialization"); + // Query what compute capability architecture a kernel executes: Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); From 6da3fa7e961905ba7b931443432f4da6fe7dfe85 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 1 Nov 2023 15:09:28 -0400 Subject: [PATCH 106/432] Threads remove unused variables and functions (#6566) * Remove wait_yields from the public interface * Remove unused get_thread_count functions * Remove unimplemented team_size_valid * Remove unused numa_rank and numa_core_rank * Remove useless include * Simplify code * fix indentation --- core/src/Threads/Kokkos_Threads_Instance.cpp | 81 +++++++------------- core/src/Threads/Kokkos_Threads_Instance.hpp | 12 --- 2 files changed, 26 insertions(+), 67 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 78c63abfa1c..2a60a54d893 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -21,13 +21,10 @@ #include -#include -#include #include #include #include #include -#include 
#include @@ -82,6 +79,12 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } +void wait_yield(volatile ThreadState &flag, const ThreadState value) { + while (value == flag) { + std::this_thread::yield(); + } +} + } // namespace } // namespace Impl } // namespace Kokkos @@ -100,13 +103,6 @@ bool ThreadsInternal::is_process() { //---------------------------------------------------------------------------- -void ThreadsInternal::wait_yield(volatile ThreadState &flag, - const ThreadState value) { - while (value == flag) { - std::this_thread::yield(); - } -} - void execute_function_noop(ThreadsInternal &, const void *) {} void ThreadsInternal::driver() { @@ -129,8 +125,6 @@ ThreadsInternal::ThreadsInternal() m_scratch(nullptr), m_scratch_reduce_end(0), m_scratch_thread_end(0), - m_numa_rank(0), - m_numa_core_rank(0), m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), @@ -150,17 +144,12 @@ ThreadsInternal::ThreadsInternal() // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - m_numa_rank = coord.first; - m_numa_core_rank = coord.second; - m_pool_base = s_threads_exec; - m_pool_rank = s_thread_pool_size[0] - (entry + 1); - m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); - m_pool_size = s_thread_pool_size[0]; - m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadState::Active; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); @@ -196,8 +185,6 @@ ThreadsInternal::~ThreadsInternal() { m_pool_base = nullptr; 
m_scratch_reduce_end = 0; m_scratch_thread_end = 0; - m_numa_rank = 0; - m_numa_core_rank = 0; m_pool_rank = 0; m_pool_size = 0; m_pool_fan_size = 0; @@ -213,8 +200,6 @@ ThreadsInternal::~ThreadsInternal() { } } -int ThreadsInternal::get_thread_count() { return s_thread_pool_size[0]; } - ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] @@ -460,22 +445,17 @@ void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { fence(); - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = - Kokkos::hwloc::get_available_threads_per_core(); - - // Forestall compiler warnings for unused variables. - (void)numa_count; - (void)cores_per_numa; - (void)threads_per_core; - s << "Kokkos::Threads"; #if defined(KOKKOS_ENABLE_THREADS) s << " KOKKOS_ENABLE_THREADS"; #endif #if defined(KOKKOS_ENABLE_HWLOC) + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"; #endif @@ -496,14 +476,12 @@ void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); - s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." - << th->m_numa_core_rank << " ]"; + s << " Thread[ " << th->m_pool_rank << " ]"; s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; - s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank - << "." 
<< thfan->m_numa_core_rank << " ]"; + s << " [ " << thfan->m_pool_rank << " ]"; } s << " }"; @@ -616,13 +594,8 @@ void ThreadsInternal::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_pool_base = s_threads_exec; s_threads_process.m_pool_rank = thread_count - 1; // Reversed for scan-compatible reductions s_threads_process.m_pool_size = thread_count; @@ -711,13 +684,11 @@ void ThreadsInternal::finalize() { s_thread_pool_size[2] = 0; // Reset master thread to run solo. - s_threads_process.m_numa_rank = 0; - s_threads_process.m_numa_core_rank = 0; - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 1; - s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadState::Inactive; + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadState::Inactive; Kokkos::Profiling::finalize(); } diff --git a/core/src/Threads/Kokkos_Threads_Instance.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp index ff010c1ccd0..074331bcaf1 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -56,8 +56,6 @@ class ThreadsInternal { void *m_scratch; int m_scratch_reduce_end; size_t m_scratch_thread_end; - int m_numa_rank; - int m_numa_core_rank; int m_pool_rank; int m_pool_rank_rev; int m_pool_size; @@ -89,11 +87,8 @@ class ThreadsInternal { public: KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } KOKKOS_INLINE_FUNCTION int pool_rank() 
const { return m_pool_rank; } - KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } - KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } inline long team_work_index() const { return m_team_work_index; } - static int get_thread_count(); static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } @@ -125,15 +120,8 @@ class ThreadsInternal { static void finalize(); - /* Given a requested team size, return valid team size */ - static unsigned team_size_valid(unsigned); - static void print_configuration(std::ostream &, const bool detail = false); - //------------------------------------ - - static void wait_yield(volatile ThreadState &, const ThreadState); - //------------------------------------ // All-thread functions: From 024d6c21b809f1e5187a2866135c856232dc7ba4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 1 Nov 2023 15:14:31 -0400 Subject: [PATCH 107/432] Remove unused Sandia testing files (#6568) * [ci skip] Remove (unused) config/ directory * [ci skip] Remove mention to config/yaml/ directory in Spack doc Co-Authored-By: Bruno Turcksin --------- Co-authored-by: Bruno Turcksin --- Spack.md | 1 - config/test_all_sandia | 773 ----------------------------------------- config/yaml/volta.yaml | 4 - 3 files changed, 778 deletions(-) delete mode 100755 config/test_all_sandia delete mode 100644 config/yaml/volta.yaml diff --git a/Spack.md b/Spack.md index 79606c259d5..06c763a64ee 100644 --- a/Spack.md +++ b/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). -Some examples are included in the `config/yaml` folder for common platforms. Before running `spack install ` we recommend running `spack spec ` to confirm your dependency tree is correct. 
For example, with Kokkos Kernels: ````bash diff --git a/config/test_all_sandia b/config/test_all_sandia deleted file mode 100755 index 193a162a4e6..00000000000 --- a/config/test_all_sandia +++ /dev/null @@ -1,773 +0,0 @@ -#!/bin/bash -e - -# -# Global config -# - -set -o pipefail - -# Determine current machine. - -MACHINE="" -HOSTNAME=$(hostname) -PROCESSOR=`uname -p` - -if [[ "$HOSTNAME" =~ (white|ride).* ]]; then - MACHINE=white - module load git -fi - -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - -if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name - if [[ "$PROCESSOR" = "aarch64" ]]; then - MACHINE=sullivan - module load git - fi -fi - -if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name - if [[ "$MACHINE" = "" ]]; then - MACHINE=shepard - module load git - fi -fi - -if [[ "$HOSTNAME" == apollo\.* ]]; then - MACHINE=apollo - module load git -fi - -if [[ "$HOSTNAME" == sullivan ]]; then - MACHINE=sullivan - module load git -fi - -if [[ "$HOSTNAME" == mayer\.* ]]; then - MACHINE=mayer -# module load git -fi -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - -if [ ! 
-z "$SEMS_MODULEFILES_ROOT" ]; then - if [[ "$MACHINE" = "" ]]; then - MACHINE=sems - module load sems-git - fi -fi - -if [[ "$MACHINE" = "" ]]; then - echo "Unrecognized machine" >&2 - exit 1 -fi - -echo "Running on machine: $MACHINE" - -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" -CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" - -GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" -PGI_WARNING_FLAGS="" - -# Default. Machine specific can override. -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 -TEST_SCRIPT=False -SKIP_HWLOC=False -SPOT_CHECK=False - -PRINT_HELP=False -OPT_FLAG="" -CXX_FLAGS_EXTRA="" -LD_FLAGS_EXTRA="" -KOKKOS_OPTIONS="" - -# -# Handle arguments. 
-# - -while [[ $# > 0 ]] -do - key="$1" - - case $key in - --kokkos-path*) - KOKKOS_PATH="${key#*=}" - ;; - --build-list*) - CUSTOM_BUILD_LIST="${key#*=}" - ;; - --debug*) - DEBUG=True - ;; - --build-only*) - BUILD_ONLY=True - ;; - --test-script*) - TEST_SCRIPT=True - ;; - --skip-hwloc*) - SKIP_HWLOC=True - ;; - --num*) - NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" - ;; - --dry-run*) - DRYRUN=True - ;; - --spot-check*) - SPOT_CHECK=True - ;; - --arch*) - ARCH_FLAG="--arch=${key#*=}" - ;; - --opt-flag*) - OPT_FLAG="${key#*=}" - ;; - --with-cuda-options*) - KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" - ;; - --with-options*) - KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" - ;; - --cxxflags-extra*) - CXX_FLAGS_EXTRA="${key#*=}" - ;; - --ldflags-extra*) - LD_FLAGS_EXTRA="${key#*=}" - ;; - --help*) - PRINT_HELP=True - ;; - *) - # args, just append - ARGS="$ARGS $1" - ;; - esac - - shift -done - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) - -# Set kokkos path. -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path. - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) -fi - -UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` -if ! [ -z "$UNCOMMITTED" ]; then - echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" - echo "$UNCOMMITTED" - echo "" -fi - -GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` -echo "Repository Status: " ${GITSTATUS} -echo "" -echo "" - -# -# Machine specific config. 
-# - -if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - - BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-/" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="" - fi - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST 
$KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/6.4.0,ibm/xl/16.1.0" - - # Don't do pthread on white. - GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Power8,Kepler37" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "sullivan" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - 
ARCH_FLAG="--arch=ARMv8-ThunderX" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="/" - ARM_MODULE_LIST="/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-TX2" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - BASE_MODULE_LIST_INTEL="/compilers/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=HSW" - fi - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "apollo" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - module use /home/projects/modulefiles/local/x86-64 - module load kokkos-env - - module load sems-git - module load sems-tex - module load sems-cmake/3.5.2 - module load sems-gdb - - SKIP_HWLOC=True - - BASE_MODULE_LIST="sems-env,kokkos-env,sems-/,kokkos-hwloc/1.10.1/base" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,sems-gcc/5.3.0" - - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" - 
BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - ) - fi - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SNB,Volta70" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -if [ 
"$PRINT_HELP" = "True" ]; then - echo "test_all_sandia :" - echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" - echo " Defaults to root repo containing this script" - echo "--debug: Run tests in debug. Defaults to False" - echo "--test-script: Test this script, not Kokkos" - echo "--skip-hwloc: Do not do hwloc tests" - echo "--num=N: Number of jobs to run in parallel" - echo "--spot-check: Minimal test set to issue pull request" - echo "--dry-run: Just print what would be executed" - echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" - echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" - echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" - echo "--arch=ARCHITECTURE: overwrite architecture flags" - echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" - echo "--build-list=BUILD,BUILD,BUILD..." - echo " Provide a comma-separated list of builds instead of running all builds" - echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" - echo "" - - echo "ARGS: list of expressions matching compilers to test" - echo " supported compilers sems" - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - echo " $COMPILER" - done - echo "" - - echo "Examples:" - echo " Run all tests" - echo " % test_all_sandia" - echo "" - echo " Run all gcc tests" - echo " % test_all_sandia gcc" - echo "" - echo " Run all gcc/4.8.4 and all intel tests" - echo " % test_all_sandia gcc/4.8.4 intel" - echo "" - echo " Run all tests in debug" - echo " % test_all_sandia --debug" - echo "" - echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" - echo "" - echo "If you want to kill the tests, do:" - echo " hit ctrl-z" - echo " % kill -9 %1" - echo - exit 0 -fi - -# Set build type. 
-if [ "$DEBUG" = "True" ]; then - BUILD_TYPE=debug -else - BUILD_TYPE=release -fi - -# If no args provided, do all compilers. -if [ -z "$ARGS" ]; then - ARGS='?' -fi - -# Process args to figure out which compilers to test. -COMPILERS_TO_TEST="" - -for ARG in $ARGS; do - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - - if [[ "$COMPILER" = $ARG* ]]; then - if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then - COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" - else - echo "Tried to add $COMPILER twice" - fi - fi - done -done - -# -# Functions. -# - -# get_compiler_name -get_compiler_name() { - echo $1 | cut -d/ -f1 -} - -# get_compiler_version -get_compiler_version() { - echo $1 | cut -d/ -f2 -} - -# Do not call directly. -get_compiler_data() { - local compiler=$1 - local item=$2 - local compiler_name=$(get_compiler_name $compiler) - local compiler_vers=$(get_compiler_version $compiler) - - local compiler_data - for compiler_data in "${COMPILERS[@]}" ; do - local arr=($compiler_data) - - if [ "$compiler" = "${arr[0]}" ]; then - echo "${arr[$item]}" | tr , ' ' | sed -e "s//$compiler_name/g" -e "s//$compiler_vers/g" - return 0 - fi - done - - # Not found. - echo "Unreconized compiler $compiler" >&2 - exit 1 -} - -# -# For all getters, usage: -# - -get_compiler_modules() { - get_compiler_data $1 1 -} - -get_compiler_build_list() { - get_compiler_data $1 2 -} - -get_compiler_exe_name() { - get_compiler_data $1 3 -} - -get_compiler_warning_flags() { - get_compiler_data $1 4 -} - -run_cmd() { - echo "RUNNING: $*" - if [ "$DRYRUN" != "True" ]; then - eval "$* 2>&1" - fi -} - -# report_and_log_test_results -report_and_log_test_result() { - # Use sane var names. - local success=$1; local desc=$2; local comment=$3; - - if [ "$success" = "0" ]; then - echo " PASSED $desc" - echo $comment > $PASSED_DIR/$desc - else - # For failures, comment should be the name of the phase that failed. 
- echo " FAILED $desc" >&2 - echo $comment > $FAILED_DIR/$desc - cat ${desc}.${comment}.log - fi -} - -setup_env() { - local compiler=$1 - local compiler_modules=$(get_compiler_modules $compiler) - - module purge - - local mod - for mod in $compiler_modules; do - echo "Loading module $mod" - module load $mod 2>&1 - # It is ridiculously hard to check for the success of a loaded - # module. Module does not return error codes and piping to grep - # causes module to run in a subshell. - module list 2>&1 | grep "$mod" >& /dev/null || return 1 - done - - return 0 -} - -# single_build_and_test -single_build_and_test() { - # Use sane var names. - local compiler=$1; local build=$2; local build_type=$3; - - # Set up env. - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" - local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') - setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - - # Set up flags. 
- local compiler_warning_flags=$(get_compiler_warning_flags $compiler) - local compiler_exe=$(get_compiler_exe_name $compiler) - - if [[ "$build_type" = hwloc* ]]; then - local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) - fi - - if [[ "$OPT_FLAG" = "" ]]; then - OPT_FLAG="-O3" - fi - - if [[ "$build_type" = *debug* ]]; then - local extra_args="$extra_args --debug" - local cxxflags="-g $compiler_warning_flags" - local ldflags="-g" - else - local cxxflags="$OPT_FLAG $compiler_warning_flags" - local ldflags="${OPT_FLAG}" - fi - - local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" - local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" - - if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" - fi - if [[ "$KOKKOS_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_OPTIONS" - else - local extra_args="$extra_args --with-options=enable_large_mem_tests" - fi - - echo " Starting job $desc" - - local comment="no_comment" - - if [ "$TEST_SCRIPT" = "True" ]; then - local rand=$[ 1 + $[ RANDOM % 10 ]] - sleep $rand - - if [ $rand -gt 5 ]; then - run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } - fi - else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - local -i build_start_time=$(date +%s) - run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } - local -i build_end_time=$(date +%s) - comment="build_time=$(($build_end_time-$build_start_time))" - - if [[ "$BUILD_ONLY" == False ]]; then - run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } - local -i run_end_time=$(date +%s) - comment="$comment 
run_time=$(($run_end_time-$build_end_time))" - fi - fi - - report_and_log_test_result 0 $desc "$comment" - - return 0 -} - -# wait_for_jobs -wait_for_jobs() { - local -i max_jobs=$1 - local -i num_active_jobs=$(jobs | wc -l) - while [ $num_active_jobs -ge $max_jobs ] - do - sleep 1 - num_active_jobs=$(jobs | wc -l) - jobs >& /dev/null - done -} - -# run_in_background -run_in_background() { - local compiler=$1 - - local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - # Don't override command line input. - # if [[ "$BUILD_ONLY" == True ]]; then - # num_jobs=8 - # else - if [[ "$compiler" == cuda* ]]; then - num_jobs=1 - fi - if [[ "$compiler" == clang ]]; then - num_jobs=1 - fi - # fi - wait_for_jobs $num_jobs - - single_build_and_test $* & -} - -# build_and_test_all -build_and_test_all() { - # Get compiler data. - local compiler=$1 - if [ -z "$CUSTOM_BUILD_LIST" ]; then - local compiler_build_list=$(get_compiler_build_list $compiler) - else - local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') - fi - - # Do builds. - local build - for build in $compiler_build_list - do - run_in_background $compiler $build $BUILD_TYPE - - # If not cuda, do a hwloc test too. - if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then - run_in_background $compiler $build "hwloc-$BUILD_TYPE" - fi - done - - return 0 -} - -get_test_root_dir() { - local existing_results=$(find . 
-maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) - local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) - local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} - - if [ $num_to_delete -gt 0 ]; then - /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) - fi - - echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") -} - -wait_summarize_and_exit() { - wait_for_jobs 1 - - echo "#######################################################" - echo "PASSED TESTS" - echo "#######################################################" - - local passed_test - for passed_test in $(\ls -1 $PASSED_DIR | sort) - do - echo $passed_test $(cat $PASSED_DIR/$passed_test) - done - - local -i rv=0 - if [ "$(ls -A $FAILED_DIR)" ]; then - echo "#######################################################" - echo "FAILED TESTS" - echo "#######################################################" - - local failed_test - for failed_test in $(\ls -1 $FAILED_DIR | sort) - do - echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" - rv=$rv+1 - done - fi - - exit $rv -} - -# -# Main. 
-# - -ROOT_DIR=$(get_test_root_dir) -mkdir -p $ROOT_DIR -cd $ROOT_DIR - -PASSED_DIR=$ROOT_DIR/results/passed -FAILED_DIR=$ROOT_DIR/results/failed -mkdir -p $PASSED_DIR -mkdir -p $FAILED_DIR - -echo "Going to test compilers: " $COMPILERS_TO_TEST -for COMPILER in $COMPILERS_TO_TEST; do - echo "Testing compiler $COMPILER" - build_and_test_all $COMPILER -done - -wait_summarize_and_exit diff --git a/config/yaml/volta.yaml b/config/yaml/volta.yaml deleted file mode 100644 index f67af9c2a44..00000000000 --- a/config/yaml/volta.yaml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - kokkos: - variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 - compiler: [gcc@7.2.0] From b76e1dcc152632d4485c4069a37777e602acf83d Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Tue, 24 Oct 2023 17:49:34 -0600 Subject: [PATCH 108/432] fallback implementation cleanup --- simd/src/Kokkos_SIMD_AVX2.hpp | 40 ++++++++++++++++++++++++------- simd/src/Kokkos_SIMD_Common.hpp | 42 --------------------------------- 2 files changed, 31 insertions(+), 51 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 521160b76fc..5f792751303 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -1111,6 +1111,11 @@ class simd> { return simd( _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { @@ -1278,6 +1283,13 @@ class simd> { _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit signed integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend 
simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + // AVX2 only has eq and gt comparisons for int64 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { @@ -1306,17 +1318,19 @@ class simd> { return !(lhs == rhs); } + // fallback simd shift right arithmetic using generator constructor // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, int rhs) noexcept { - // return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs)); - // } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); + } - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, simd const& rhs) noexcept { - // return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs), - // static_cast<__m256i>(rhs)))); - // } + // fallback simd shift right arithmetic using generator constructor + // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { @@ -1460,6 +1474,14 @@ class simd> { return simd( _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit unsigned integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * 
rhs[i]; }); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs); diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index 87edf994533..cc9397c4fc6 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -117,48 +117,6 @@ template return const_where_expression(mask, value); } -// fallback simd multiplication using generator constructor -// At the time of this writing, this fallback is only used -// to multiply vectors of 64-bit signed integers for the AVX2 backend - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator*( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); -} - -// fallback simd shift using generator constructor -// At the time of this edit, only the fallback for shift vectors of -// 64-bit signed integers for the AVX2 backend is used - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs[i]; }); -} - // The code below provides: // operator@(simd, Arithmetic) // operator@(Arithmetic, simd) From 3b8c449f17d058d79733fb721d2a1d09a247cae0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 Nov 2023 13:30:36 -0400 Subject: [PATCH 109/432] 
Remove empty quotation marks for static_assert --- .../unit_tests/TestStdAlgorithmsModOps.cpp | 4 +- .../TestStdAlgorithmsPartitionCopy.cpp | 6 +- containers/src/Kokkos_DynRankView.hpp | 2 +- core/src/Cuda/Kokkos_CudaSpace.hpp | 11 +- core/src/HIP/Kokkos_HIP_Space.hpp | 3 +- core/src/KokkosExp_MDRangePolicy.hpp | 4 +- core/src/Kokkos_HBWSpace.hpp | 7 +- core/src/Kokkos_HostSpace.hpp | 3 +- core/src/Kokkos_MathematicalFunctions.hpp | 2 +- core/src/Kokkos_View.hpp | 10 +- core/src/SYCL/Kokkos_SYCL_Space.hpp | 15 +- core/src/impl/Kokkos_ViewArray.hpp | 7 +- core/src/impl/Kokkos_ViewMapping.hpp | 28 +- core/src/traits/Kokkos_IndexTypeTrait.hpp | 2 +- .../traits/Kokkos_OccupancyControlTrait.hpp | 2 +- core/src/traits/Kokkos_PolicyTraitAdaptor.hpp | 4 +- core/src/traits/Kokkos_ScheduleTrait.hpp | 2 +- .../traits/Kokkos_WorkItemPropertyTrait.hpp | 2 +- core/unit_test/TestAggregate.hpp | 30 +- core/unit_test/TestComplex.hpp | 20 +- core/unit_test/TestConcepts.hpp | 68 ++--- core/unit_test/TestFunctorAnalysis.hpp | 58 ++-- .../TestHostSharedPtrAccessOnDevice.hpp | 2 +- .../TestJoinBackwardCompatibility.hpp | 5 +- core/unit_test/TestMathematicalFunctions.hpp | 52 ++-- core/unit_test/TestNumericTraits.hpp | 101 +++---- core/unit_test/TestTeamBasic.hpp | 2 +- core/unit_test/TestUtilities.hpp | 18 +- core/unit_test/TestViewAPI.hpp | 3 +- core/unit_test/TestViewMapping_a.hpp | 277 ++++++++---------- core/unit_test/TestViewMapping_b.hpp | 14 +- core/unit_test/cuda/TestCuda_Spaces.cpp | 166 +++++------ .../default/TestDefaultDeviceType.cpp | 9 +- core/unit_test/hip/TestHIP_Spaces.cpp | 150 ++++------ core/unit_test/sycl/TestSYCL_Spaces.cpp | 195 +++++------- core/unit_test/tools/TestProfilingSection.cpp | 10 +- 36 files changed, 555 insertions(+), 739 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097e..c0130885dc5 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ 
b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); m_view(index) = std::move(a); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce88..a36c9db2b9e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); const std::size_t ext = view_from.extent(0); diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee4..33d9562ea4f 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1340,7 +1340,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); 
+static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5..df03785b112 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -239,8 +239,7 @@ struct Impl::is_hip_type_space : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); //---------------------------------------- diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01ca..d0ae7fdcea5 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -96,7 +96,7 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), @@ -120,7 +120,7 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { a[i] = checked_narrow_cast(other[i]); (void)checked_narrow_cast(other[i]); // see note above diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp index 369b7bafb7b..56fe607b349 100644 --- a/core/src/Kokkos_HBWSpace.hpp +++ b/core/src/Kokkos_HBWSpace.hpp @@ -188,10 +188,9 @@ namespace Kokkos { namespace Impl { -static_assert( - 
Kokkos::Impl::MemorySpaceAccess::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HBWSpace, + Kokkos::Experimental::HBWSpace>::assignable); template <> struct MemorySpaceAccess { diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 90d14040637..c20bb1abc60 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -129,8 +129,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); template struct HostMirror { diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index 361d1317e94..3fead8dd293 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -198,7 +198,7 @@ using promote_3_t = typename promote_3::type; long double> \ FUNC(T1 x, T2 y) { \ using Promoted = Kokkos::Impl::promote_2_t; \ - static_assert(std::is_same_v, ""); \ + static_assert(std::is_same_v); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ } diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index bcbb28014cd..1d2b4b9be2c 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -814,15 +814,15 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) 
{ - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); } public: diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b..252391b2f2c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -166,19 +166,16 @@ struct is_sycl_type_space : public std::true_type {}; static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); template <> struct MemorySpaceAccess> { private: using array_analysis = ViewArrayAnalysis; - static_assert(std::is_void

::value, ""); + static_assert(std::is_void

::value); static_assert(std::is_same>::value, - ""); + Kokkos::Array>::value); static_assert(std::is_scalar::value, "View of Array type must be of a scalar type"); @@ -507,7 +506,7 @@ class ViewMapping< Kokkos::LayoutStride>::value))>, SrcTraits, Args...> { private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); + static_assert(SrcTraits::rank == sizeof...(Args)); enum : bool { R0 = is_integral_extent<0, Args...>::value, diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 01d0dc4f681..16ca33a87d0 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -657,21 +657,20 @@ struct SubviewExtents { template KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. 
- static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -814,8 +813,7 @@ struct ViewDataAnalysis { // Must match array analysis when this default template is used. 
static_assert( std::is_same::value, - ""); + typename array_analysis::non_const_value_type>::value); public: using specialize = void; // No specialization @@ -3896,7 +3894,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits::value, ""); + static_assert(std::is_integral::value); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType; }; diff --git a/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c372..c2ca5a341f1 100644 --- a/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -163,7 +163,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::OccupancyControlTrait::policy_with_trait; diff --git a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762ad..98ad1d7ebbb 100644 --- a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp +++ b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list, type_list, NewTrait, std::enable_if_t::value>> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; @@ -92,7 +92,7 @@ template class PolicyTemplate, struct PolicyTraitAdaptorImpl, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; diff --git a/core/src/traits/Kokkos_ScheduleTrait.hpp b/core/src/traits/Kokkos_ScheduleTrait.hpp index 86130025530..4e91d89f0f9 100644 --- 
a/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template constexpr auto require(Policy const& p, Kokkos::Schedule) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule>; return new_policy_t{p}; diff --git a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c851..ae7aa6e534f 100644 --- a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty>; return new_policy_t{p}; diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index 4f67b2eddce..f1316a7426a 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -29,35 +29,31 @@ void TestViewAggregate() { value_type>; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); using a32_traits = Kokkos::ViewTraits; using flat_traits = Kokkos::ViewTraits; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); - static_assert(a32_traits::rank == 2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); + std::is_same::value); + static_assert(a32_traits::rank == 2); + static_assert(a32_traits::rank_dynamic == 2); - static_assert(std::is_void::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - 
static_assert(flat_traits::dimension::N2 == 32, ""); + static_assert(std::is_void::value); + static_assert(flat_traits::rank == 3); + static_assert(flat_traits::rank_dynamic == 2); + static_assert(flat_traits::dimension::N2 == 32); using a32_type = Kokkos::View **, DeviceType>; using a32_flat_type = typename a32_type::array_type; - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(a32_type::rank == 2); + static_assert(a32_flat_type::rank == 3); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/core/unit_test/TestComplex.hpp b/core/unit_test/TestComplex.hpp index bcae2e1d816..5501a35b7f0 100644 --- a/core/unit_test/TestComplex.hpp +++ b/core/unit_test/TestComplex.hpp @@ -451,17 +451,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex, long double, Kokkos::complex); diff --git a/core/unit_test/TestConcepts.hpp b/core/unit_test/TestConcepts.hpp index 476a8848325..b85867bf63a 100644 --- a/core/unit_test/TestConcepts.hpp +++ b/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space{}, ""); 
-static_assert(Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); - -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); - -static_assert(Kokkos::is_device{}, ""); -static_assert(Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); - -static_assert(Kokkos::is_execution_space_v, ""); -static_assert(!Kokkos::is_execution_space_v, ""); +static_assert(Kokkos::is_execution_space{}); +static_assert(Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); + +static_assert(Kokkos::is_memory_space{}); +static_assert(Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); + +static_assert(Kokkos::is_device{}); +static_assert(Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); + 
+static_assert(Kokkos::is_execution_space_v); +static_assert(!Kokkos::is_execution_space_v); static_assert( - std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); + std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); /*------------------------------------------------- begin test for team_handle concept diff --git a/core/unit_test/TestFunctorAnalysis.hpp b/core/unit_test/TestFunctorAnalysis.hpp index c024526111b..e58324144e4 100644 --- a/core/unit_test/TestFunctorAnalysis.hpp +++ b/core/unit_test/TestFunctorAnalysis.hpp @@ -59,16 +59,15 @@ void test_functor_analysis() { using R01 = typename A01::Reducer; - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_same::value, - ""); - - static_assert(!A01::has_join_member_function, ""); - static_assert(!A01::has_init_member_function, ""); - static_assert(!A01::has_final_member_function, ""); - static_assert(A01::StaticValueSize == 0, ""); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_same::value); + + static_assert(!A01::has_join_member_function); + static_assert(!A01::has_init_member_function); + static_assert(!A01::has_final_member_function); + static_assert(A01::StaticValueSize == 0); ASSERT_EQ(R01(c01).length(), 0); //------------------------------ @@ -78,16 +77,15 @@ void test_functor_analysis() { Kokkos::RangePolicy, decltype(c02), void>; using R02 = typename A02::Reducer; - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); 
- static_assert(!A02::has_join_member_function, ""); - static_assert(!A02::has_init_member_function, ""); - static_assert(!A02::has_final_member_function, ""); - static_assert(A02::StaticValueSize == sizeof(double), ""); + static_assert(!A02::has_join_member_function); + static_assert(!A02::has_init_member_function); + static_assert(!A02::has_final_member_function); + static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -99,23 +97,19 @@ void test_functor_analysis() { using R03 = typename A03::Reducer; static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type*>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type&>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); //------------------------------ diff --git a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff52051..467b9ad157f 100644 --- a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); 
CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { diff --git a/core/unit_test/TestJoinBackwardCompatibility.hpp b/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa709..efe4a2307a8 100644 --- a/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + error_operator_plus_equal_volatile); +static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index d7390172b68..4d203ead75f 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -1315,19 +1315,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); static_assert(std::is_same(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; @@ -1451,17 +1449,14 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { 
static_assert(std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); #endif } }; @@ -1530,19 +1525,16 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_assert( std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same::value, ""); + std::is_same::value); #endif } }; @@ -1622,11 +1614,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index aa11f21dd1f..3c159ebb341 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -407,7 +407,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT::value); \ template \ using has_##TRAIT = 
Kokkos::is_detected; \ - static_assert(!has_##TRAIT::value, ""); + static_assert(!has_##TRAIT::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -489,39 +489,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min(), ""); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min()); // integer types -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == 
std::numeric_limits::min(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min()); 
+static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); 
+static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -588,15 +588,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ - static_assert( \ - std::numeric_limits::TRAIT() != std::numeric_limits::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - std::numeric_limits::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + Kokkos::Experimental::TRAIT::value); \ + static_assert(std::numeric_limits::TRAIT() != \ + std::numeric_limits::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + std::numeric_limits::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -616,14 +614,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ 
- Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -671,17 +666,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ diff --git a/core/unit_test/TestTeamBasic.hpp b/core/unit_test/TestTeamBasic.hpp index c395bc0837c..a3d84c5e16b 100644 --- a/core/unit_test/TestTeamBasic.hpp +++ b/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic::value, ""); + static_assert(!std::is_arithmetic::value); TestTeamBroadcast, long_wrapper>::test_teambroadcast(0, 1); diff --git a/core/unit_test/TestUtilities.hpp b/core/unit_test/TestUtilities.hpp index b1f9d30c1fc..ad5a0df92de 100644 --- a/core/unit_test/TestUtilities.hpp +++ b/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using 
Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of, Kokkos::pair>{}, ""); - static_assert(is_specialization_of, Kokkos::View>{}, ""); + static_assert(is_specialization_of, Kokkos::pair>{}); + static_assert(!is_specialization_of, Kokkos::pair>{}); + static_assert(is_specialization_of, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of const, Kokkos::View>{}, - ""); + static_assert( + !is_specialization_of const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of, Kokkos::pair>{}); } namespace { diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index ffc500e4a9a..4c27695f6d1 100644 --- a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -958,8 +958,7 @@ class TestViewAPI { using mirror_type = typename view_type::HostMirror; static_assert(std::is_same::value, - ""); + typename mirror_type::memory_space>::value); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 9173f0d4316..a4dfdb26e3f 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - 
static_assert(int(dim_0::ArgN2) == 1, ""); - - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); - - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); - - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); - - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); - - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); - - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); - - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); - - 
static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); + + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); + + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); + + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); + + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); + + static_assert(int(dim_s0_s3::rank) == int(2)); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); + + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + 
static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,11 +514,11 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { @@ -529,54 +529,48 @@ void test_view_mapping() { using a_const_int_r1 = ViewArrayAnalysis; using a_const_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r1::dimension::rank == 1, ""); - 
static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0> >::value); + static_assert(std::is_same::value); - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + 
static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); + static_assert(std::is_same::value); - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); static_assert(std::is_same >::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -587,15 +581,15 @@ void test_view_mapping() { // Dimensions of t_i4 are appended to the multdimensional array. using a_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -603,71 +597,54 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - 
""); + Kokkos::Impl::ViewDimension<0> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, ""); - static_assert( - std::is_same::value, - ""); + std::is_same::value); + static_assert(std::is_same::value); using a_const_int_r3 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0, 0, 4> >::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); - static_assert( - std::is_same::value, - ""); + int* * [4]>::value); + static_assert(std::is_same::value); static_assert( std::is_same::value, - ""); + int* * [4]>::value); // std::cout << "typeid( const int**[4] ).name() = " << typeid( const // int**[4] ).name() << std::endl; diff --git a/core/unit_test/TestViewMapping_b.hpp b/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da845..4aee035d17a 100644 --- a/core/unit_test/TestViewMapping_b.hpp +++ b/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + 
static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -167,7 +167,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(std::is_trivially_copyable{}, ""); + static_assert(std::is_trivially_copyable{}); } } // namespace Test diff --git a/core/unit_test/cuda/TestCuda_Spaces.cpp b/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101abb..11fe6b8555b 100644 --- a/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,166 @@ __global__ void 
test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert( - !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaSpace>::accessible); - 
static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaSpace>::accessible); 
static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View; static_assert(std::is_same::Space; static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); } } // namespace Test diff --git a/core/unit_test/hip/TestHIP_Spaces.cpp b/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e28837..8f7499c244b 100644 --- a/core/unit_test/hip/TestHIP_Spaces.cpp +++ 
b/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,164 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + 
Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - 
static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); } template diff --git a/core/unit_test/sycl/TestSYCL_Spaces.cpp b/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f8432488..a4fd053e83d 100644 --- a/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,235 +21,192 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); 
static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - 
Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - 
Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); 
static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::SYCLSharedUSMSpace>::Space, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + Kokkos::Experimental::SYCLHostUSMSpace>::value); static_assert( std::is_same< Kokkos::Device, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLSharedUSMSpace>::Space, + 
Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { diff --git a/core/unit_test/tools/TestProfilingSection.cpp b/core/unit_test/tools/TestProfilingSection.cpp index 318766ac455..9d35d67feb0 100644 --- a/core/unit_test/tools/TestProfilingSection.cpp +++ b/core/unit_test/tools/TestProfilingSection.cpp @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible::value, ""); -static_assert(!std::is_copy_constructible::value, ""); -static_assert(!std::is_move_constructible::value, ""); -static_assert(!std::is_copy_assignable::value, ""); -static_assert(!std::is_move_assignable::value, ""); +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); From cead4f559407885a3ccb00621d9b04e15bef5c8b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Nov 2023 10:46:43 -0400 Subject: [PATCH 110/432] [ci skip] Drop unused header --- core/src/impl/Kokkos_Memory_Fence.hpp | 54 --------------------------- 1 file changed, 54 deletions(-) delete mode 100644 core/src/impl/Kokkos_Memory_Fence.hpp diff --git a/core/src/impl/Kokkos_Memory_Fence.hpp b/core/src/impl/Kokkos_Memory_Fence.hpp deleted file mode 100644 index 42a53b04fb2..00000000000 --- a/core/src/impl/Kokkos_Memory_Fence.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP) -#define KOKKOS_MEMORY_FENCE_HPP -namespace Kokkos { - -////////////////////////////////////////////////////// -// store_fence() -// -// If possible use a store fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void store_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("sfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -////////////////////////////////////////////////////// -// load_fence() -// -// If possible use a load fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void load_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("lfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -} // namespace Kokkos - -#endif From 1e1ed13181261fe1d1affb308dcbdd0a10ef17a6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 Nov 2023 13:19:28 -0400 Subject: [PATCH 111/432] Drop Clang+CUDA workaround --- Makefile.kokkos | 4 ---- cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_arch.cmake | 3 --- core/unit_test/TestTeamVector.hpp | 4 ---- 4 files changed, 12 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index e5d5d865ccc..7c1914e3076 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -687,10 +687,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") - endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 8ef464be33c..3713d269fa0 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -59,7 +59,6 @@ #cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_ENABLE_ONEDPL #cmakedefine KOKKOS_ARCH_SSE42 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 924a2b8bcbf..5d857f42fdc 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -191,9 +191,6 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF(CUDAToolkit_BIN_DIR) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) ENDIF() - IF (KOKKOS_ENABLE_CUDA) - SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) - ENDIF() ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 39122736ed7..5e16539d652 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1012,7 +1012,6 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(0))); ASSERT_TRUE((TestTeamVector::Test(1))); @@ -1028,9 +1027,7 @@ TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(11))); ASSERT_TRUE((TestTeamVector::Test(12))); } -#endif -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, triple_nested_parallelism) { // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 @@ -1055,7 +1052,6 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { 
TestTripleNestedReduce(8192, 2048, 16, 16); TestTripleNestedReduce(8192, 2048, 7, 16); } -#endif TEST(TEST_CATEGORY, parallel_scan_with_reducers) { using T = double; From a7b16b351154d248a9b206a18e6c8692c1b52662 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Tue, 7 Nov 2023 10:54:46 -0800 Subject: [PATCH 112/432] OpenMPTarget: CI compiler upgrade. (#6545) * OpenMPTarget: CI compiler upgrade. * OpenMPTarget: Only block unit tests for 17.0.3. * OpenMPTarget: Set minimum clang version. * OpenMPTarget: update compiler version that block unit test. * OpenMPTarget: Block unit tests based on compiler. --------- Co-authored-by: Rahulkumar Gayatri --- cmake/kokkos_compiler_id.cmake | 5 +++++ core/unit_test/CMakeLists.txt | 17 +++++++++-------- scripts/docker/Dockerfile.openmptarget | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/cmake/kokkos_compiler_id.cmake b/cmake/kokkos_compiler_id.cmake index 04589befc3a..9135ca2b41c 100644 --- a/cmake/kokkos_compiler_id.cmake +++ b/cmake/kokkos_compiler_id.cmake @@ -152,6 +152,7 @@ ENDIF() SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. 
Required compiler versions:") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) 15.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") @@ -210,6 +211,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() ENDIF() IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index e280f4c7ef1..38dc1364768 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -373,20 +373,21 @@ if(Kokkos_ENABLE_OPENMPTARGET) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp - endif() IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + IF (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.3) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif() endif() # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. 
diff --git a/scripts/docker/Dockerfile.openmptarget b/scripts/docker/Dockerfile.openmptarget index 708cf533b8a..22edcda2a07 100644 --- a/scripts/docker/Dockerfile.openmptarget +++ b/scripts/docker/Dockerfile.openmptarget @@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm ${CMAKE_SCRIPT} ENV PATH=${CMAKE_DIR}/bin:$PATH -ARG LLVM_VERSION=llvmorg-17.0.1 +ARG LLVM_VERSION=llvmorg-17.0.3 ENV LLVM_DIR=/opt/llvm RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ From 403c34f304077eeb92dc8fd8cb2fd24ae6c35688 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 7 Nov 2023 14:00:46 -0500 Subject: [PATCH 113/432] m_cudaDev isn't static anymore --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 78e82df1005..045b91771ba 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -686,7 +686,7 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; - Impl::CudaInternal::m_cudaDev = cuda_device_id; + m_cudaDev = cuda_device_id; Impl::CudaInternal::m_deviceProp = cudaProp; Kokkos::Impl::cuda_device_synchronize( From 8d9400e294329a08b7728809289cf4daec0548c4 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 7 Nov 2023 12:04:02 -0700 Subject: [PATCH 114/432] Add crtrott's launch_latency benchmark (#6379) * Add crtrott's launch_latency benchmark * benchmarks/launch_latency: allow measurements to be skipped * launch_latency.cpp: newline * benchmarks/launch_latency: loops instead of copy-paste, additional comment * Update benchmarks/launch_latency/launch_latency.cpp Co-authored-by: Bruno Turcksin * benchmark/launch_latency: Fix N,M,K parsing, remove redundant measure, add copyright * 
benchmark/launch_latency: move copyright header to the top * benchmarks/launch_latency: remove redundant fence --------- Co-authored-by: Bruno Turcksin --- benchmarks/CMakeLists.txt | 1 + benchmarks/launch_latency/CMakeLists.txt | 4 + benchmarks/launch_latency/launch_latency.cpp | 283 +++++++++++++++++++ 3 files changed, 288 insertions(+) create mode 100644 benchmarks/launch_latency/CMakeLists.txt create mode 100644 benchmarks/launch_latency/launch_latency.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 42279bf55db..cccf7c759e1 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1 +1,2 @@ KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) diff --git a/benchmarks/launch_latency/CMakeLists.txt b/benchmarks/launch_latency/CMakeLists.txt new file mode 100644 index 00000000000..bb14da749d1 --- /dev/null +++ b/benchmarks/launch_latency/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + launch_latency + SOURCES launch_latency.cpp +) diff --git a/benchmarks/launch_latency/launch_latency.cpp b/benchmarks/launch_latency/launch_latency.cpp new file mode 100644 index 00000000000..73b176ab8dd --- /dev/null +++ b/benchmarks/launch_latency/launch_latency.cpp @@ -0,0 +1,283 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file launch_latency.cpp + + Tests of parallel_for and parallel_reduce latency for different + circumstances. 
+ + Three launch kinds are tested: parallel_for, parallel_reduce into scalar, + and parallel_reduce into view + + N controls how large the parallel loop is + V controls how large the functor is + M controls across how many launches the latency is averaged + K controls how large the nested loop is (no larger than V) + + For each launch kind, + 1. Avg functor dispatch latency: (time to do M launches) / M + 2. Avg functor completion throughput: (M launches + sync) / M + 3. Avg functor completion latency: (M (launch + sync)) / M +*/ + +#include + +template +struct TestFunctor { + double values[V]; + Kokkos::View a; + int K; + TestFunctor(Kokkos::View a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + } +}; + +template +struct TestRFunctor { + double values[V]; + Kokkos::View a; + int K; + TestRFunctor(Kokkos::View a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double& lsum) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + lsum += a(i); + } +}; + +struct Opts { + bool par_for = true; + bool par_reduce = true; + bool par_reduce_view = true; +}; + +template +void run(int N, int M, int K, const Opts& opts) { + std::string l_no_fence, l_fence, l_red_no_fence, l_red_fence, + l_red_view_no_fence, l_red_view_fence; + { + std::ostringstream ostream; + ostream << "RunNoFence_" << N << "_" << K << std::endl; + l_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunFence_" << N << "_" << K << std::endl; + l_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceNoFence_" << N << "_" << K << std::endl; + l_red_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceFence_" << N << "_" << K << std::endl; + l_red_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewNoFence_" << N << "_" << K <<
std::endl; + l_red_view_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewFence_" << N << "_" << K << std::endl; + l_red_view_fence = ostream.str(); + } + + double result; + Kokkos::View a("A", N); + Kokkos::View v_result("result"); + TestFunctor f(a, K); + TestRFunctor rf(a, K); + Kokkos::Timer timer; + + // initialize to an obviously wrong value + double time_no_fence = -1; // launch loop + double time_no_fence_fenced = -1; // launch loop then fence + double time_fence = -1; // launch&fence loop + + double time_red_no_fence = -1; + double time_red_no_fence_fenced = -1; + double time_red_fence = -1; + + double time_red_view_no_fence = -1; + double time_red_view_no_fence_fenced = -1; + double time_red_view_fence = -1; + + if (opts.par_for) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_for(l_no_fence, N, f); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_no_fence, N, f); + } + time_no_fence = timer.seconds(); + Kokkos::fence(); + time_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_fence, N, f); + Kokkos::fence(); + } + time_fence = timer.seconds(); + } + + if (opts.par_reduce) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + time_red_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_fence, N, rf, result); + Kokkos::fence(); + } + time_red_fence = timer.seconds(); + Kokkos::fence(); + } + + if (opts.par_reduce_view) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 
0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + time_red_view_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_view_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result); + Kokkos::fence(); + } + time_red_view_fence = timer.seconds(); + Kokkos::fence(); + timer.reset(); + } + + const double x = 1.e6 / M; + printf("%i %i %i %i", N, V, K, M); + if (opts.par_for) { + printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence, + x * time_no_fence_fenced); + } + if (opts.par_reduce) { + printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence, + x * time_red_fence, x * time_red_no_fence_fenced); + } + if (opts.par_reduce_view) { + printf(" parallel_reduce(view): %lf %lf ( %lf )", + x * time_red_view_no_fence, x * time_red_view_fence, + x * time_red_view_no_fence_fenced); + } + printf("\n"); +} +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int N = 10000; + int M = 20; + int K = 1; + + Opts opts; + + printf("==========================\n"); + printf("Kokkos Launch Latency Test\n"); + printf("==========================\n"); + printf("\n"); + printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]); + printf("Arguments: N M K\n"); + printf(" N: loop length\n"); + printf(" M: how many kernels to dispatch\n"); + printf( + " K: nested loop length (capped by size of functor member array\n\n"); + printf("Options:\n"); + printf(" --no-parallel-for: skip parallel_for benchmark\n"); + printf(" --no-parallel-reduce: skip parallel_reduce benchmark\n"); + printf( + " --no-parallel-reduce-view: skip parallel_reduce into view " + "benchmark\n"); + printf("\n\n"); + printf(" Output V is the size of the functor member array\n"); + printf("\n\n"); + + for (int i = 1; i < argc; ++i) { + const std::string_view arg(argv[i]); + + // anything that doesn't start with -- + if (arg.size() < 2 || + 
(arg.size() >= 2 && arg[0] != '-' && arg[1] != '-')) { + if (i == 1) + N = atoi(arg.data()); + else if (i == 2) + M = atoi(arg.data()); + else if (i == 3) + K = atoi(arg.data()); + else { + throw std::runtime_error("unexpected argument!"); + } + } else if (arg == "--no-parallel-for") { + opts.par_for = false; + } else if (arg == "--no-parallel-reduce") { + opts.par_reduce = false; + } else if (arg == "--no-parallel-reduce-view") { + opts.par_reduce_view = false; + } else { + std::stringstream ss; + ss << "unexpected argument \"" << arg << "\" at position " << i; + throw std::runtime_error(ss.str()); + } + } + + printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n"); + + /* A backend may have different launch strategies for functors of different + * sizes: test a variety of functor sizes.*/ + run<1>(N, M, K <= 1 ? K : 1, opts); + run<16>(N, M, K <= 16 ? K : 16, opts); + run<200>(N, M, K <= 200 ? K : 200, opts); + run<3000>(N, M, K <= 3000 ? K : 3000, opts); + run<30000>(N, M, K <= 30000 ? 
K : 30000, opts); + } + Kokkos::finalize(); +} From a453e9fc32fb605085017568258f98b141bdb686 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Tue, 7 Nov 2023 14:06:58 -0500 Subject: [PATCH 115/432] Simplify fence functions in the Threads backend (#6571) * Simplify fence functions in the Threads backend * Remove extra blank space --- core/src/Threads/Kokkos_Threads.hpp | 8 --- core/src/Threads/Kokkos_Threads_Instance.cpp | 53 +++++++------------- core/src/Threads/Kokkos_Threads_Instance.hpp | 12 ++--- 3 files changed, 24 insertions(+), 49 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index 558d5a93984..36a66230be0 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -38,14 +38,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { -enum class fence_is_static { yes, no }; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /** \brief Execution space for a pool of C++11 threads on a CPU. 
*/ diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 2a60a54d893..f5a97000000 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -253,46 +253,29 @@ int ThreadsInternal::in_parallel() { return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsInternal::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsInternal::fence(const std::string &name) { - internal_fence(name, Impl::fence_is_static::yes); +void ThreadsInternal::fence() { + fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } - -void ThreadsInternal::internal_fence(Impl::fence_is_static is_static) { - internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsInternal::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsInternal::fence: Unnamed Static Fence", - is_static); +void ThreadsInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + internal_fence); } // Wait for root thread to become inactive -void ThreadsInternal::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { - const auto &fence_lam = [&]() { - if (s_thread_pool_size[0]) { - // Wait for the root thread to complete: - Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadState::Active); - } +void ThreadsInternal::internal_fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); + } - s_current_function = nullptr; - s_current_function_arg = nullptr; + s_current_function = nullptr; + s_current_function_arg = nullptr; - // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent 
launch. - memory_fence(); - }; - if (is_static == Impl::fence_is_static::yes) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - fence_lam); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence_lam); - } + // Make sure function and arguments are cleared before + // potentially re-activating threads with a subsequent launch. + memory_fence(); } /** \brief Begin execution of the asynchronous functor */ @@ -710,7 +693,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsInternal::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::fence(name); } Threads &Threads::impl_instance(int) { diff --git a/core/src/Threads/Kokkos_Threads_Instance.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp index 074331bcaf1..b79b527940a 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -405,11 +405,7 @@ class ThreadsInternal { static int in_parallel(); static void fence(); static void fence(const std::string &); - static void internal_fence( - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static void internal_fence( - const std::string &, - Impl::fence_is_static is_static = Impl::fence_is_static::yes); + static void internal_fence(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread @@ -572,7 +568,11 @@ inline void Threads::print_configuration(std::ostream &os, bool verbose) const { } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsInternal::internal_fence(name, Impl::fence_is_static::yes); + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + 
GlobalDeviceSynchronization, + Impl::ThreadsInternal::internal_fence); } } /* namespace Kokkos */ From 80084960cbfa700f2dac8c8ffa02585eb3cbcb50 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:34:19 -0700 Subject: [PATCH 116/432] simd: temporarily skip device math ops unit test for OpenMPTarget build (#6574) * Temporary disable to prevent further CI failures * Added a description for the test skip * Update simd/unit_tests/include/TestSIMD_MathOps.hpp Co-authored-by: Damien L-G * Update simd/unit_tests/include/TestSIMD_MathOps.hpp Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- simd/unit_tests/include/TestSIMD_MathOps.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/simd/unit_tests/include/TestSIMD_MathOps.hpp b/simd/unit_tests/include/TestSIMD_MathOps.hpp index fae17a07ace..6f8a8aa0f29 100644 --- a/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -294,6 +294,12 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif Kokkos::parallel_for(1, simd_device_math_ops_functor()); } From d4a517f82d73a780d03ab06ece627b47d7aad43f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 7 Nov 2023 20:36:31 +0000 Subject: [PATCH 117/432] Set the device id explicitly for CUDA API calls in impl_initialize --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 045b91771ba..6d0f0707d82 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -685,12 +685,10 @@ void 
Cuda::impl_initialize(InitializationSettings const &settings) { const auto &dev_info = Impl::CudaInternalDevices::singleton(); const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; + Impl::CudaInternal::m_deviceProp = cudaProp; - m_cudaDev = cuda_device_id; - Impl::CudaInternal::m_deviceProp = cudaProp; - - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Query what compute capability architecture a kernel executes: Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); @@ -758,6 +756,7 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default //---------------------------------- cudaStream_t singleton_stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); Impl::CudaInternal::singleton().initialize(singleton_stream, From fcb0452d0d14ca865b057b7c14941106f3f374ff Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 Nov 2023 17:54:28 +0000 Subject: [PATCH 118/432] OpenMPTarget: Guard scratch memory usage in ParallelReduce --- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 1 + core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp | 1 + .../Kokkos_OpenMPTarget_ParallelReduce_Range.hpp | 6 +++++- .../Kokkos_OpenMPTarget_ParallelReduce_Team.hpp | 7 ++++++- .../OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp | 7 ++++++- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 1902c38409a..ea434b39533 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -75,6 +75,7 @@ int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; 
uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; +std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index a84de76aad0..2a7063b966a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -750,6 +750,7 @@ class OpenMPTargetExec { int64_t thread_local_bytes, int64_t league_size); static void* m_scratch_ptr; + static std::mutex m_mutex_scratch_ptr; static int64_t m_scratch_size; static int* m_lock_array; static uint64_t m_lock_size; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4452af3846d..caa568a8925 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,6 +55,9 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: @@ -105,7 +108,8 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} + m_result_ptr_num_elems(arg_result_view.size()), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa71511..8abffa47a43 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,6 +470,10 @@ class ParallelReduce m_scratch_memory_lock; + public: void execute() const { const FunctorType& functor = m_functor_reducer.get_functor(); @@ -517,7 +521,8 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 41e62ce6e6b..6878531730d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -438,6 +438,10 @@ class ParallelReduce m_scratch_memory_lock; + public: inline void execute() const { execute_tile( @@ -452,7 +456,8 @@ class ParallelReduce::accessible) {} + typename ViewType::memory_space>::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} template inline std::enable_if_t execute_tile(const FunctorType& functor, From 26464df04cc9fd24091b0fe719bfd85e2900ee97 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 Nov 2023 13:39:27 -0500 Subject: [PATCH 119/432] SYCL: Implement 
DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path (#6534) * SYCL: Implement DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path * Sync with desul * [ci skip] Try improving comments * Configure DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION instead of compiler flag * Print SYCL_EXT_ONEAPI_DEVICE_GLOBAL in configuration --- cmake/kokkos_arch.cmake | 13 ++- core/src/CMakeLists.txt | 3 + core/src/SYCL/Kokkos_SYCL.cpp | 5 ++ .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 2 + .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 2 + .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 2 + tpls/desul/Config.hpp.cmake.in | 1 + .../include/desul/atomics/Lock_Array_SYCL.hpp | 80 +++++++++++++++++-- tpls/desul/src/Lock_Array_SYCL.cpp | 21 ++--- 13 files changed, 113 insertions(+), 24 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 5d857f42fdc..920ce8eadfc 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -585,16 +585,20 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Even if SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we still can't -# use device global variables with shared libraries -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. 
+IF(KOKKOS_ENABLE_SYCL) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") INCLUDE(CheckCXXSymbolExists) CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSE() + ELSEIF(NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" #include @@ -614,6 +618,7 @@ IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + # Only the separable compilation implementation is supported. COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED ) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 012af0a7d06..a4edf1ba160 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -18,6 +18,9 @@ IF (NOT desul_FOUND) ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + ENDIF() ENDIF() IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693a..af64b6908d4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -99,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro 
SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b0..7fbf5420f83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor, const BarePolicy bare_policy(m_policy); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 9c5767d209f..b4de7eb89ff 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 1f2629407b0..f8abdf8443d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp 
b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index bc2e47658ed..953d2235b31 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -103,6 +103,8 @@ class Kokkos::Impl::ParallelReduce global_mem; sycl::device_ptr group_results; + desul::ensure_sycl_lock_arrays_on_device(q); + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor num_teams_done(1, cgh); diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index a7bc738191e..614c2352b9e 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp index 8216f9a797c..e1170ed2aae 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp @@ -57,14 +57,35 @@ void finalize_lock_arrays_sycl(sycl::queue q); * \brief This global variable in SYCL space is what kernels use to get access * to the lock arrays. * - * There is only one single instance of this global variable for the entire - * executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose - * declaration here must be extern). This one instance will be initialized - * by initialize_host_sycl_lock_arrays and need not be modified afterwards. + * When relocatable device code is enabled, there is only one single instance of this + * global variable for the entire executable, whose definition will be in + * Kokkos_SYCL_Locks.cpp (and whose declaration here must then be extern). 
This one + * instance will be initialized by initialize_host_sycl_lock_arrays and need not be + * modified afterwards. + * + * When relocatable device code is disabled, an instance of this variable will be + * created in every translation unit that sees this header file (we make this clear by + * marking it static, meaning no other translation unit can link to it). Since the + * Kokkos_SYCL_Locks.cpp translation unit cannot initialize the instances in other + * translation units, we must update this SYCL global variable based on the Host global + * variable prior to running any kernels that will use it. That is the purpose of the + * ensure_sycl_lock_arrays_on_device function. */ -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} 
+ #else // not supported template @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/tpls/desul/src/Lock_Array_SYCL.cpp b/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41a..6660c76e11a 100644 --- a/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - // sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + 
copy_sycl_lock_arrays_to_device(q); q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif From c8b4fe848daf494ad9802040dc6850f9a939f19e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Nov 2023 09:11:54 -1000 Subject: [PATCH 120/432] Desul atomics: Trade SYCL-specific compile definition for a macro defintion in the configuration header --- cmake/kokkos_arch.cmake | 5 ++--- core/src/Kokkos_Atomics_Desul_Config.hpp | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 920ce8eadfc..986e1b5bfb8 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -595,9 +595,8 @@ IF(KOKKOS_ENABLE_SYCL) INCLUDE(CheckCXXSymbolExists) CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) ELSEIF(NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" @@ -620,7 +619,7 @@ IF(KOKKOS_ENABLE_SYCL) IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. 
COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + DEFAULT -fsycl-device-code-split=off ) ENDIF() ENDIF() diff --git a/core/src/Kokkos_Atomics_Desul_Config.hpp b/core/src/Kokkos_Atomics_Desul_Config.hpp index 4cf170f5f13..0523b21c513 100644 --- a/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/core/src/Kokkos_Atomics_Desul_Config.hpp @@ -32,4 +32,8 @@ static_assert(false, #define DESUL_CUDA_ARCH_IS_PRE_VOLTA #endif +#if defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) +#define DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED +#endif + #endif // KOKKOS_ATOMICS_DESUL_CONFIG_HPP From 0a83695e5c53ee255c880bc2e10c0a4986dcc315 Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Khan Date: Thu, 9 Nov 2023 14:06:51 -0500 Subject: [PATCH 121/432] Replace Marsaglia polar method with Box-muller to generate a normally distributed random number (#6556) * Kokkos Random: Replace Marsaglia polar method with Box-muller to generate a normally distributed random number * Apply clang-formatting * Add const qualifier to some internal variables * Update Kokkos_Random.hpp --- algorithms/src/Kokkos_Random.hpp | 34 +++++++++++++++----------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp index 2d7d236d2fc..89126609885 100644 --- a/algorithms/src/Kokkos_Random.hpp +++ b/algorithms/src/Kokkos_Random.hpp @@ -849,18 +849,17 @@ class Random_XorShift64 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = 
drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1094,18 +1093,17 @@ class Random_XorShift1024 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION From 3f773d057b37ea59a1cfe3e0a098d5dbe157ee47 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 9 Nov 2023 11:38:07 -0800 Subject: [PATCH 122/432] OpenMP: No memset in viewfill (#6573) * OpenMP: Edit copyview implementation. * OpenMP: Use memset for Serial backend. * Update core/src/Kokkos_CopyViews.hpp OpenMP: do not use memset for 0's only if execution space is OpenMP Co-authored-by: Daniel Arndt * Fix incorrect code. 
--------- Co-authored-by: Rahulkumar Gayatri Co-authored-by: Daniel Arndt --- core/src/Kokkos_CopyViews.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index a0ca55be704..3f02748c9cc 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -1348,13 +1348,14 @@ inline std::enable_if_t< contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) + // With OpenMP, using memset has significant performance issues. + if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v +#endif + ) ZeroMemset>(exec_space, dst, value); else -#endif contiguous_fill(exec_space, dst, value); } From 81e308e7da46aaa4ab040789a2715a7ebf99d200 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 Nov 2023 09:49:28 -1000 Subject: [PATCH 123/432] Revert "Desul atomics: Trade SYCL-specific compile definition for a macro defintion in the configuration header" --- cmake/kokkos_arch.cmake | 5 +++-- core/src/Kokkos_Atomics_Desul_Config.hpp | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 986e1b5bfb8..920ce8eadfc 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -595,8 +595,9 @@ IF(KOKKOS_ENABLE_SYCL) INCLUDE(CheckCXXSymbolExists) CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - # Use the non-separable compilation implementation to support shared libraries as well. 
SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. + COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) ELSEIF(NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" @@ -619,7 +620,7 @@ IF(KOKKOS_ENABLE_SYCL) IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off + DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED ) ENDIF() ENDIF() diff --git a/core/src/Kokkos_Atomics_Desul_Config.hpp b/core/src/Kokkos_Atomics_Desul_Config.hpp index 0523b21c513..4cf170f5f13 100644 --- a/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/core/src/Kokkos_Atomics_Desul_Config.hpp @@ -32,8 +32,4 @@ static_assert(false, #define DESUL_CUDA_ARCH_IS_PRE_VOLTA #endif -#if defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) -#define DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -#endif - #endif // KOKKOS_ATOMICS_DESUL_CONFIG_HPP From 97a90d5dd22afafebb5c0202ed8ab63724566304 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 Nov 2023 13:14:52 -1000 Subject: [PATCH 124/432] OpenACC: add atomics support (#6446) * Initial OpenACC atomic construct implementation. * Partially fixed bugs in the OpenACC atomic implementations. * First working version, where general atomic implementations work only on a sequential host * Update Unit Test CMake * Update as suggested by code review: - Remove const_cast() - Change Kokkos::abort() with printf() - Add FIXME_OPENACC comment. * Disable unsupported OpenACC atomic tests. (OpenACC C/C++ does not support atomic max/min/mod operations) Disable TestOpenACC_BitManipulationBuiltins for OpenACC due to errors. * Apply ClangFormat * Disable unsupported unit tests when compiled by old NVHPC compilers (V22.5 or older). 
* Apply ClangFormat * Update tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp Co-authored-by: Damien L-G * Restore unit tests that were disabled for old NVHPC compilers (V22.5 or older) * Update unit test CMakeLists.txt to include unit tests enabled by this PR. * Change the minimum version of the NVHPC compiler from 22.3 to 22.9 for the OpenACC backend. * Changed the way to guard unsupported atomic tests for the OpenACC backend. * Remove unnecessary guarding on unsupported atomic tests for the OpenACC backend. * Minor updates according to the code review. * Changed the supported-type-checking code from macro to alias template as suggested by the code review. * Undo changing the minimum required version of NVHPC. * Apply suggestions from code review Co-authored-by: Damien L-G * Change the KOKKOS_COMPILER_NVHPC macro to __NVCOMPILER * Rename a variable's name from `tmp` to `old` in atomic_fetch_*() functions. * Change `ptr[0]` to `*ptr` as suggested by the code review. * Add FIXME comments in `device_atomic_thread_fence()`. * Moved definitions into the desul::Impl namespace as suggested by the code review. * Clean up the OpenACC atomic implementations. Re-enable atomic max/min tests for OpenACC. * Fix a typo (sid => std) * Minor bug fix on OpenACC * Update .jenkins to the latest. * Test this please * Try again * Fix typo * Deal with conflicts * Disable complex float atomic unit tests for OpenACC backend * Sync with PR opened on the desul side * DO NOT MERGE disable all CI but OpenACC * - Disable atomic-fetch-shift tests for NVHPC OpenACC compiler, which fail due to compiler bugs, which are reported to NVIDIA. - Change the values of start and end variables in TestAtomicOperations_double.hpp and TestAtomicOperations_float.hpp so that atomic-division tests calculate trivial divisions. (In the original tests, NVHPC compiler failed since device results are slightly different from the host results due to precision mismatch.) 
* Add atomic_op_test_rel() to TestAtomicOperations.hpp, which compares the host and device atomic operations using a relative error. * Revert "DO NOT MERGE disable all CI but OpenACC" This reverts commit 18132bffc0fd5491ed4c8191377cd8a9d1692910. * [desul_atomics] Fixup Kokkos -> DESUL in error message with OpenACC --------- Co-authored-by: Seyong Lee --- Makefile.kokkos | 6 + core/src/CMakeLists.txt | 3 + core/src/Kokkos_Macros.hpp | 2 +- core/unit_test/CMakeLists.txt | 19 +- core/unit_test/TestAtomicOperations.hpp | 74 +++ core/unit_test/TestAtomics.hpp | 4 +- tpls/desul/Config.hpp.cmake.in | 1 + .../desul/atomics/Compare_Exchange.hpp | 3 + .../atomics/Compare_Exchange_OpenACC.hpp | 149 ++++++ tpls/desul/include/desul/atomics/Fetch_Op.hpp | 3 + .../desul/atomics/Fetch_Op_OpenACC.hpp | 427 ++++++++++++++++++ tpls/desul/include/desul/atomics/Generic.hpp | 48 ++ .../desul/atomics/Lock_Based_Fetch_Op.hpp | 3 + .../atomics/Lock_Based_Fetch_Op_OpenACC.hpp | 77 ++++ tpls/desul/include/desul/atomics/Macros.hpp | 28 ++ .../include/desul/atomics/Thread_Fence.hpp | 3 + .../desul/atomics/Thread_Fence_OpenACC.hpp | 25 + 17 files changed, 855 insertions(+), 20 deletions(-) create mode 100644 tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp diff --git a/Makefile.kokkos b/Makefile.kokkos index 7c1914e3076..97b92a32892 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1440,6 +1440,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") 
+endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index a4edf1ba160..b4a25c0813e 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -25,6 +25,9 @@ IF (NOT desul_FOUND) IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP ENDIF() + IF(KOKKOS_ENABLE_OPENACC) + SET(DESUL_ATOMICS_ENABLE_OPENACC ON) + ENDIF() CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index d32ab2e57b6..3f53fcba683 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -87,7 +87,7 @@ #if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) + !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENACC) #define KOKKOS_INTERNAL_NOT_PARALLEL #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 38dc1364768..8c9a6684987 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -65,7 +65,7 @@ SET(KOKKOS_THREADS_NAME Threads) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) + SET(KOKKOS_OPENACC_FEATURE_LEVEL 17) ENDIF() SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -524,17 +524,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -551,17 +541,10 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) endif() diff --git a/core/unit_test/TestAtomicOperations.hpp b/core/unit_test/TestAtomicOperations.hpp index a5aebed4138..cd7ba47aa1e 100644 --- a/core/unit_test/TestAtomicOperations.hpp +++ b/core/unit_test/TestAtomicOperations.hpp @@ -368,6 +368,63 @@ bool atomic_op_test(T old_val, T update) { return result == 0; } +template +constexpr T relative_error_threshold = T(1.0e-15); + +template +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > relative_error_threshold) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold) local_result += 2; + if (fabs(op_data(2)) > relative_error_threshold) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 4) != 0) + 
printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; +} + //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- @@ -395,6 +452,12 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: return atomic_op_test(old_val, update); case 10: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: return update_in >= 0 ? atomic_op_test( old_val, update) @@ -403,6 +466,7 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { return update_in >= 0 ? atomic_op_test( old_val, update) : true; +#endif case 13: return atomic_op_test(old_val, update); case 14: @@ -440,10 +504,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test(old_val, update); case 3: return atomic_op_test(old_val, update); case 4: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel( + old_val, update) + : true; +#else case 5: return update != 0 ? 
atomic_op_test(old_val, update) : true; +#endif case 6: return atomic_op_test(old_val, update); } diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 2b40f12d0a4..5f48e8c9746 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -498,7 +498,9 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE((TestAtomic::Loop(100, 2))); ASSERT_TRUE((TestAtomic::Loop(100, 3))); -#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET + // FIXME_OPENACC: atomic operations on composite types are not supported. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 2))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 3))); diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index 614c2352b9e..aed7ecfabc9 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -16,5 +16,6 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_SYCL #cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1dee..72639fc4932 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 00000000000..225079c15db --- /dev/null +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp 
@@ -0,0 +1,149 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include + +#include +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separetely to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'". 
+ // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast(dest), + reinterpret_cast(compare), + reinterpret_cast(value)); + return reinterpret_cast(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v) { +#else + } else if constexpr (std::is_same_v || std::is_same_v) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // 
unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c57437..1b161397c74 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..10294c423f9 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,427 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include // min, max +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v || +#endif + is_openacc_integral_type_v; + +#else + +template +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v; + +#endif + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_add( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template 
+std::enable_if_t, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + int old; + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_add_fetch( + 
T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, 
MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { + printf( + "DESUL 
error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +// + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Generic.hpp b/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e34..fa71477c299 100644 --- a/tpls/desul/include/desul/atomics/Generic.hpp +++ b/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, 
const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const 
T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 +124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, 
scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, 
order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 
+272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, @@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template 
DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ -343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } 
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..6b78ce39043 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for 
the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d323..d11beb0c805 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expresion whether we are on host or +// on device with NVHPC. We use the device implementation on both sides. 
+#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07f..6a741f6d478 100644 --- a/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 00000000000..a5c8aa1c8a7 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. +} + +} // namespace Impl +} // namespace desul + +#endif From d5a4802911318aebecbc775990dd198260ce2383 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 9 Nov 2023 21:52:37 -0500 Subject: [PATCH 125/432] Fix infinity, quiet_NaN, signaling_Nan, isfinite, isnan, isinf for half_t and bhalf_t (#6543) * Fix nvcc warning for non-trivial types in bit_cast * Introduce BitComparisonWrapper * Implement isnan, isfinite, isinf for half_t, bhalf_t with bit comparison * Fix infinity, quiet_NaN, signaling_NaN for half_t, bhalf_t * Improve tests * Disable TestCuda_WithoutInitializing for NVHPC * Define exponent/fraction_mask in FloatingPointWrapper.hpp * Minimize changes to TestMathematicalFunctions.hpp * Enable tests for inf, quiet_nan, signaling_nan for half_t and bhalf_t * Don't repeat storage class specifier in template specialization * Try inline constexpr and move definitons for the same type together * Disable numeric traits unit tests for NVHPC * Define comparison operators for BitComparisonWrapper * Fix TestNumericTraits, no constexpr consructor for [b]half_t --- containers/unit_tests/CMakeLists.txt | 5 + core/src/Kokkos_BitManipulation.hpp | 2 +- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 68 ++++++- .../Kokkos_Half_MathematicalFunctions.hpp | 80 +++++++- core/src/impl/Kokkos_Half_NumericTraits.hpp | 20 +- core/unit_test/TestMathematicalFunctions.hpp | 179 ++++++++++++++++-- core/unit_test/TestNumericTraits.hpp | 24 ++- 7 files changed, 342 insertions(+), 36 deletions(-) diff --git a/containers/unit_tests/CMakeLists.txt 
b/containers/unit_tests/CMakeLists.txt index b777581043d..e69e46bb6a8 100644 --- a/containers/unit_tests/CMakeLists.txt +++ b/containers/unit_tests/CMakeLists.txt @@ -46,6 +46,11 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() + # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. extractor: bad opc 0 + if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + endif() + KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index f3baf71660e..f5653aaba34 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -115,7 +115,7 @@ bit_cast(From const& from) noexcept { return sycl::bit_cast(from); #else To to; - memcpy(&to, &from, sizeof(To)); + memcpy(static_cast(&to), static_cast(&from), sizeof(To)); return to; #endif } diff --git a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index b1ff643a71e..4a22898d168 100644 --- a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -18,6 +18,7 @@ #define KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_ #include +#include // bit_cast #include #include // istream & ostream for extraction and insertion ops @@ -215,10 +216,70 @@ cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x); /************************** END forward declarations **************************/ namespace Impl { + +template +struct BitComparisonWrapper { + std::uint16_t value; + + template + KOKKOS_FUNCTION friend bool operator==(BitComparisonWrapper a, Number b) { + return static_cast(a) == b; + } + + template + KOKKOS_FUNCTION friend bool operator!=(BitComparisonWrapper a, Number b) { + return static_cast(a) != 
b; + } + + template + KOKKOS_FUNCTION friend bool operator<(BitComparisonWrapper a, Number b) { + return static_cast(a) < b; + } + + template + KOKKOS_FUNCTION friend bool operator<=(BitComparisonWrapper a, Number b) { + return static_cast(a) <= b; + } + + template + KOKKOS_FUNCTION friend bool operator>(BitComparisonWrapper a, Number b) { + return static_cast(a) > b; + } + + template + KOKKOS_FUNCTION friend bool operator>=(BitComparisonWrapper a, Number b) { + return static_cast(a) >= b; + } +}; + +template +inline constexpr BitComparisonWrapper exponent_mask; +template +inline constexpr BitComparisonWrapper fraction_mask; + +#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED +template <> +inline constexpr BitComparisonWrapper + exponent_mask{0b0'11111'0000000000}; +template <> +inline constexpr BitComparisonWrapper + fraction_mask{0b0'00000'1111111111}; +#endif + +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +template <> +inline constexpr BitComparisonWrapper + exponent_mask{0b0'11111111'0000000}; +template <> +inline constexpr BitComparisonWrapper + fraction_mask{0b0'00000000'1111111}; +#endif + template class alignas(FloatType) floating_point_wrapper { public: - using impl_type = FloatType; + using impl_type = FloatType; + using bit_comparison_type = BitComparisonWrapper; private: impl_type val; @@ -269,6 +330,11 @@ class alignas(FloatType) floating_point_wrapper { #endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH } + KOKKOS_FUNCTION + floating_point_wrapper(bit_comparison_type rhs) { + val = Kokkos::bit_cast(rhs); + } + // Don't support implicit conversion back to impl_type. // impl_type is a storage only type on host. 
KOKKOS_FUNCTION diff --git a/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp b/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp index 55e0cf0c8ff..e6a5cadc67c 100644 --- a/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp +++ b/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp @@ -18,6 +18,7 @@ #define KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_ #include // For the float overloads +#include // bit_cast // clang-format off namespace Kokkos { @@ -74,7 +75,7 @@ namespace Kokkos { KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long) \ KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long long) \ KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long long) - + #define KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF(FUNC, HALF_TYPE) \ KOKKOS_INLINE_FUNCTION bool FUNC(HALF_TYPE x) { \ @@ -155,10 +156,77 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, nextaf KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, copysign) // Classification and comparison functions // fpclassify -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, isfinite) -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, isinf) -#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, isnan) + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isfinite(Kokkos::Experimental::half_t x) { + using bit_type = Kokkos::Experimental::half_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return (bit_pattern_x.value & exponent_mask.value) != exponent_mask.value; +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isfinite(Kokkos::Experimental::bhalf_t 
x) { + using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return (bit_pattern_x.value & exponent_mask.value) != exponent_mask.value; +} +#endif + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isinf(Kokkos::Experimental::half_t x) { + using bit_type = Kokkos::Experimental::half_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) == 0)); +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isinf(Kokkos::Experimental::bhalf_t x) { + using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) == 0)); +} +#endif + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isnan(Kokkos::Experimental::half_t x) { + using bit_type = Kokkos::Experimental::half_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + 
((bit_pattern_x.value & fraction_mask.value) != 0)); +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isnan(Kokkos::Experimental::bhalf_t x) { + using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) != 0)); +} #endif // isnormal KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, signbit) @@ -188,4 +256,4 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF, imag) #undef KOKKOS_IMPL_MATH_H_FUNC_WRAPPER } // namespace Kokkos // clang-format on -#endif // KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_ \ No newline at end of file +#endif // KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_ diff --git a/core/src/impl/Kokkos_Half_NumericTraits.hpp b/core/src/impl/Kokkos_Half_NumericTraits.hpp index b5cbf22194c..9ccad45e977 100644 --- a/core/src/impl/Kokkos_Half_NumericTraits.hpp +++ b/core/src/impl/Kokkos_Half_NumericTraits.hpp @@ -70,7 +70,7 @@ /// template <> struct Kokkos::Experimental::Impl::infinity_helper { - static constexpr int value = 0x7C00; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'0000000000}; }; /// \brief: Minimum normalized number @@ -157,30 +157,30 @@ struct Kokkos::Experimental::Impl::norm_min_helper< /// \brief: Quiet not a half precision number /// -/// IEEE 754 defines this as all exponent bits high. +/// IEEE 754 defines this as all exponent bits and the first fraction bit high. 
/// /// Quiet NaN in binary16: /// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// [0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] /// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 template <> struct Kokkos::Experimental::Impl::quiet_NaN_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0xfc000; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'1000000000}; }; /// \brief: Signaling not a half precision number /// -/// IEEE 754 defines this as all exponent bits and the first fraction bit high. +/// IEEE 754 defines this as all exponent bits and the second fraction bit high. /// /// Quiet NaN in binary16: /// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] +/// [0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0] /// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 template <> struct Kokkos::Experimental::Impl::signaling_NaN_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0xfe000; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'0100000000}; }; /// \brief: Number of digits in the matissa that can be represented @@ -267,7 +267,7 @@ struct Kokkos::Experimental::Impl::max_exponent_helper< /// template <> struct Kokkos::Experimental::Impl::infinity_helper { - static constexpr int value = 0x7F80; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'0000000}; }; // Minimum normalized number @@ -303,13 +303,13 @@ struct Kokkos::Experimental::Impl::norm_min_helper< template <> struct Kokkos::Experimental::Impl::quiet_NaN_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0x7fc000; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'1000000}; }; // Signaling not a bhalf number template <> struct Kokkos::Experimental::Impl::signaling_NaN_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0x7fe000; + static constexpr 
Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'0100000}; }; // Number of digits in the matissa that can be represented // without losing precision. diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 4d203ead75f..7d8450eb548 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -1546,9 +1546,163 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) { // TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279 -// TODO: TestIsFinite, see https://github.com/kokkos/kokkos/issues/6279 +template +struct TestIsFinite { + TestIsFinite() { run(); } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 1), *this, errors); + ASSERT_EQ(errors, 0); + } + KOKKOS_FUNCTION void operator()(int, int& e) const { + using KE::infinity; + using KE::quiet_NaN; + using KE::signaling_NaN; + using Kokkos::isfinite; + if (!isfinite(1) || !isfinite(INT_MAX)) { + ++e; + Kokkos::printf("failed isfinite(integral)\n"); + } + if (!isfinite(2.f) || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value)) { + ++e; + Kokkos::printf("failed isfinite(float)\n"); + } + if (!isfinite(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isfinite(KE::half_t)\n"); + } + if (!isfinite(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isfinite(KE::bhalf_t)\n"); + } + if (!isfinite(3.) 
+#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isfinite(double)\n"); + } +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + if (!isfinite(4.l) || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value)) { + ++e; + Kokkos::printf("failed isfinite(long double)\n"); + } +#endif + // special values + if (isfinite(INFINITY) || isfinite(NAN)) { + ++e; + Kokkos::printf("failed isfinite(floating_point) special values\n"); + } + + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + static_assert(std::is_same::value); +#endif + } +}; + +TEST(TEST_CATEGORY, mathematical_functions_isfinite) { + TestIsFinite(); +} + +template +struct TestIsInf { + TestIsInf() { run(); } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 1), *this, errors); + ASSERT_EQ(errors, 0); + } + KOKKOS_FUNCTION void operator()(int, int& e) const { + using KE::infinity; + using KE::quiet_NaN; + using KE::signaling_NaN; + using Kokkos::isinf; + if (isinf(1) || isinf(INT_MAX)) { + ++e; + Kokkos::printf("failed isinf(integral)\n"); + } + if (isinf(2.f) || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || !isinf(infinity::value)) { + ++e; + Kokkos::printf("failed isinf(float)\n"); + } + if (isinf(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isinf(KE::half_t)\n"); + } + if (isinf(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed 
isinf(KE::bhalf_t)\n"); + } + if (isinf(3.) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || !isinf(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isinf(double)\n"); + } +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + if (isinf(4.l) || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value)) { + ++e; + Kokkos::printf("failed isinf(long double)\n"); + } +#endif + // special values + if (!isinf(INFINITY) || isinf(NAN)) { + ++e; + Kokkos::printf("failed isinf(floating_point) special values\n"); + } -// TODO: TestIsInf, see https://github.com/kokkos/kokkos/issues/6279 + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + static_assert(std::is_same::value); +#endif + } +}; + +TEST(TEST_CATEGORY, mathematical_functions_isinf) { + TestIsInf(); +} template struct TestIsNaN { @@ -1559,6 +1713,7 @@ struct TestIsNaN { ASSERT_EQ(errors, 0); } KOKKOS_FUNCTION void operator()(int, int& e) const { + using KE::infinity; using KE::quiet_NaN; using KE::signaling_NaN; using Kokkos::isnan; @@ -1567,35 +1722,34 @@ struct TestIsNaN { Kokkos::printf("failed isnan(integral)\n"); } if (isnan(2.f) || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value)) { + !isnan(signaling_NaN::value) || isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(float)\n"); } -#if !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP if (isnan(static_cast(2.f)) -#if !defined(KOKKOS_ENABLE_CUDA) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) + !isnan(signaling_NaN::value) || + isnan(infinity::value) #endif ) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) -#if !defined(KOKKOS_ENABLE_CUDA) +#ifndef 
KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) + !isnan(signaling_NaN::value) || + isnan(infinity::value) #endif ) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); } -#endif if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) + !isnan(signaling_NaN::value) || isnan(infinity::value) #endif ) { ++e; @@ -1603,7 +1757,8 @@ struct TestIsNaN { } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (isnan(4.l) || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value)) { + !isnan(signaling_NaN::value) || + isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(long double)\n"); } diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 3c159ebb341..421eac022cc 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -101,8 +101,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(Infinity, int, int& e) const { using Kokkos::Experimental::infinity; - auto const inf = infinity::value; - auto const zero = T(0); + constexpr auto inf = infinity::value; + auto const zero = T(0); e += (int)!(inf + inf == inf); e += (int)!(inf != zero); use_on_device(); @@ -147,8 +147,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(QuietNaN, int, int& e) const { #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::quiet_NaN; - constexpr auto nan = quiet_NaN::value; - constexpr auto zero = T(0); + constexpr auto nan = quiet_NaN::value; + auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); #else @@ -159,8 +159,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(SignalingNaN, int, int& e) const { #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::signaling_NaN; - constexpr auto nan = signaling_NaN::value; - constexpr auto zero = T(0); + 
constexpr auto nan = signaling_NaN::value; + auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); #else @@ -204,6 +204,10 @@ struct TestNumericTraits< #endif TEST(TEST_CATEGORY, numeric_traits_infinity) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -387,6 +391,14 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { #endif } TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); From 61b93ec7fb8c426c275525bc1754d2a4f2741a92 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Tue, 14 Nov 2023 08:28:51 +0000 Subject: [PATCH 126/432] kokkos(unique): fix allocation of temporary view to enforce using the provided space instance --- algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e0..28635824585 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy From
9c37437eaa361217f5af9201449900fa4ef37491 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 15 Nov 2023 10:57:50 -0700 Subject: [PATCH 127/432] Use binary wrapper for consistency in definition of half types numeric traits (#6590) * Use binary wrapper for consistency in definition of half types numeric traits finite_{min,max}, epsilon, norm_min, and round_error * Fix epsilon usage in TestHalfOperators.hpp * Enable more test in TestNumericTraits.hpp * Fix norm_min * Remove unimplemented features * Guard NumericTraits tests for NVHPC * Define epsilon explicitly for NVHPC and [b]half_t * Fix norm_min bhalf_t comment --------- Co-authored-by: Daniel Arndt --- core/src/impl/Kokkos_Half_NumericTraits.hpp | 20 ++++++++--------- core/unit_test/TestHalfOperators.hpp | 10 ++++----- core/unit_test/TestMathematicalFunctions.hpp | 18 +++++++++++++-- core/unit_test/TestNumericTraits.hpp | 23 ++++++++++++++++++++ 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/core/src/impl/Kokkos_Half_NumericTraits.hpp b/core/src/impl/Kokkos_Half_NumericTraits.hpp index 9ccad45e977..4779c2a6e10 100644 --- a/core/src/impl/Kokkos_Half_NumericTraits.hpp +++ b/core/src/impl/Kokkos_Half_NumericTraits.hpp @@ -87,7 +87,7 @@ struct Kokkos::Experimental::Impl::infinity_helper template <> struct Kokkos::Experimental::Impl::finite_min_helper< Kokkos::Experimental::half_t> { - static constexpr float value = -65504.0F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b1'11110'1111111111}; // -65504 }; /// \brief: Maximum normalized number @@ -104,7 +104,7 @@ struct Kokkos::Experimental::Impl::finite_min_helper< template <> struct Kokkos::Experimental::Impl::finite_max_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 65504.0F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11110'1111111111}; // +65504 }; /// \brief: This is the difference between 1 and the smallest floating point @@ -123,7 +123,7 @@ struct 
Kokkos::Experimental::Impl::finite_max_helper< template <> struct Kokkos::Experimental::Impl::epsilon_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0.0009765625F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00101'0000000000}; // 0.0009765625 }; /// @brief: The largest possible rounding error in ULPs @@ -134,7 +134,7 @@ struct Kokkos::Experimental::Impl::epsilon_helper< template <> struct Kokkos::Experimental::Impl::round_error_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0.5F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'01110'0000000000}; // 0.5 }; /// \brief: Minimum normalized positive half precision number @@ -152,7 +152,7 @@ struct Kokkos::Experimental::Impl::round_error_helper< template <> struct Kokkos::Experimental::Impl::norm_min_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0.00006103515625F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00001'0000000000}; // 0.00006103515625 }; /// \brief: Quiet not a half precision number @@ -274,30 +274,30 @@ struct Kokkos::Experimental::Impl::infinity_helper struct Kokkos::Experimental::Impl::finite_min_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = -3.38953139e38; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b1'11111110'1111111}; // -3.38953139e38 }; // Maximum normalized number template <> struct Kokkos::Experimental::Impl::finite_max_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 3.38953139e38; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111110'1111111}; // +3.38953139e38 }; // 1/2^7 template <> struct Kokkos::Experimental::Impl::epsilon_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0.0078125F; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type
value{0b0'01111000'0000000}; // 0.0078125 }; template <> struct Kokkos::Experimental::Impl::round_error_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0.5F; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'01111110'0000000}; // 0.5 }; // Minimum normalized positive bhalf number template <> struct Kokkos::Experimental::Impl::norm_min_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 1.1754494351e-38; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'00000001'0000000}; // 1.175494351e-38 }; // Quiet not a bhalf number template <> diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index bf7013cf738..752e3b50816 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -975,7 +975,7 @@ struct Functor_TestHalfOperators { template void __test_half_operators(half_type h_lhs, half_type h_rhs) { - double epsilon = Kokkos::Experimental::epsilon::value; + half_type epsilon = Kokkos::Experimental::epsilon::value; Functor_TestHalfOperators f_device(h_lhs, h_rhs); Functor_TestHalfOperators f_host(h_lhs, h_rhs); @@ -990,9 +990,9 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { // printf("op_test = %d\n", op_test); ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - epsilon); + static_cast(epsilon)); ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - epsilon); + static_cast(epsilon)); } // volatile-qualified parameter type 'volatile half_type' is deprecated @@ -1015,9 +1015,9 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H || op_test == CDIV_H_H) { ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - epsilon); + static_cast(epsilon)); 
ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - epsilon); + static_cast(epsilon)); } } #endif diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 7d8450eb548..be479e0219a 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -240,11 +240,25 @@ struct FloatingPointComparison { } #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT KOKKOS_FUNCTION - KE::half_t eps(KE::half_t) const { return KE::epsilon::value; } + KE::half_t eps(KE::half_t) const { +// FIXME_NVHPC compile-time error +#ifdef KOKKOS_COMPILER_NVHPC + return 0.0009765625F; +#else + return KE::epsilon::value; +#endif + } #endif #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT KOKKOS_FUNCTION - KE::bhalf_t eps(KE::bhalf_t) const { return KE::epsilon::value; } + KE::bhalf_t eps(KE::bhalf_t) const { +// FIXME_NVHPC compile-time error +#ifdef KOKKOS_COMPILER_NVHPC + return 0.0078125; +#else + return KE::epsilon::value; +#endif + } #endif KOKKOS_FUNCTION double eps(float) const { return FLT_EPSILON; } diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 421eac022cc..f6fdc8376fb 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -218,6 +218,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { } TEST(TEST_CATEGORY, numeric_traits_epsilon) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -228,6 +232,11 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { } TEST(TEST_CATEGORY, numeric_traits_round_error) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not 
supported @@ -238,6 +247,10 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { } TEST(TEST_CATEGORY, numeric_traits_norm_min) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -309,6 +322,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); #if !defined(KOKKOS_ENABLE_CUDA) || \ @@ -330,6 +345,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); #if !defined(KOKKOS_ENABLE_CUDA) || \ @@ -359,6 +376,8 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); #if !defined(KOKKOS_ENABLE_CUDA) || \ @@ -368,6 +387,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { } TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); From c60716df432ee5873886ef81505147bbe0072663 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 16 Nov 2023 09:20:14 +0100 Subject: [PATCH 128/432] try fix --- core/unit_test/TestDeviceAndThreads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 1d3ff8eea7e..511b182cab0 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -30,7 +30,7 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 3, 5, 7]: + for x in [1, 2, 4, 6, 8]: if x >= max_threads: break 
yield x From ee655c08ad2188474f82930e20083ace0383e625 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 16 Nov 2023 19:42:34 +0000 Subject: [PATCH 129/432] Fix TestNumericTraits.hpp for SYCL with bfloat16 support --- core/unit_test/TestNumericTraits.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index f6fdc8376fb..f197a2d8891 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -110,8 +110,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const { using Kokkos::Experimental::epsilon; - auto const eps = epsilon::value; - auto const one = T(1); + T const eps = epsilon::value; + T const one = 1; // Avoid higher precision intermediate representation compare() = one + eps; e += (int)!(compare() != one); From 1a145311ff8f002fa526608410c8e3685e908353 Mon Sep 17 00:00:00 2001 From: Pierre Kestener Date: Thu, 16 Nov 2023 16:57:16 +0100 Subject: [PATCH 130/432] Fix generated Makefile when using gnu_generate_makefile.sh and make >= 4.3 --- algorithms/unit_tests/Makefile | 14 +++++++------- containers/unit_tests/Makefile | 4 ++-- core/unit_test/Makefile | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/algorithms/unit_tests/Makefile b/algorithms/unit_tests/Makefile index 601217799a8..d3946c149ba 100644 --- a/algorithms/unit_tests/Makefile +++ b/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo
"\#include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " > Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ ) \ ) diff --git a/containers/unit_tests/Makefile b/containers/unit_tests/Makefile index 2e35832cc89..18410882bca 100644 --- a/containers/unit_tests/Makefile +++ b/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 33a84b61f92..202809d3fc9 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -67,8 +67,8 @@ TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longi tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ ) \ ) @@ -82,8 +82,8 @@ KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := 
$(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > TestCuda_$(test).cpp); \ - $(shell echo "\#include " >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -277,8 +277,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) From 8fd8c94aa553ec8e355171c00d0ece65be530ac8 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Fri, 17 Nov 2023 22:10:41 +0100 Subject: [PATCH 131/432] Threads: 
add missing broadcast to TeamThreadRange parallel_scan (#6601) * try * use reference --- core/src/Threads/Kokkos_Threads_Team.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index b811a7944ba..8f9614233b2 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -1001,8 +1001,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto & team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1012,6 +1014,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } From 932c1fb2f059ba1cbd84e0c191459cc17ee8dc23 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Mon, 13 Nov 2023 18:15:02 -0700 Subject: [PATCH 132/432] Added missing operator* to NEON simd --- simd/src/Kokkos_SIMD_NEON.hpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 43ece203890..61d506eac86 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -868,7 +868,11 @@ class simd> { return simd( vadd_s32(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast(lhs), static_cast(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1068,7 +1072,10 @@ class simd> { return simd( vaddq_s64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1261,7 +1268,10 @@ class simd> { return simd( vaddq_u64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( From ff7104cee13d01174ff896682e1aab7333934aff Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 17 Nov 2023 14:19:24 -0700 Subject: [PATCH 133/432] [ci skip] Update changelog on develop for 4.2.00 (#6592) * [ci skip] Update changelog on develop for 4.2.00 * [ci skip] Fix whitespace --- CHANGELOG.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0534d741f3..92bb6fdbe5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,97 @@ # CHANGELOG -## [4.1.00](https://github.com/kokkos/kokkos/tree/4.0.01) (2023-06-16) +## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00) + +### Features: +- SIMD: significant improvements to SIMD support and alignment with C++26 SIMD + - add `Kokkos::abs` overload for SIMD types [\#6069](https://github.com/kokkos/kokkos/pull/6069) + - add generator constructors [\#6347](https://github.com/kokkos/kokkos/pull/6347) + - convert binary operators to hidden friends [\#6320](https://github.com/kokkos/kokkos/pull/6320) + - add shift operators [\#6109](https://github.com/kokkos/kokkos/pull/6109) + - add `float` 
support [\#6177](https://github.com/kokkos/kokkos/pull/6177) + - add remaining `gather_from` and `scatter_to` overloads [\#6220](https://github.com/kokkos/kokkos/pull/6220) + - define simd math function overloads in the Kokkos namespace [\#6465](https://github.com/kokkos/kokkos/pull/6465), [\#6487](https://github.com/kokkos/kokkos/pull/6487) + - `Kokkos_ENABLE_NATIVE=ON` autodetects SIMD types supported [\#6188](https://github.com/kokkos/kokkos/pull/6188) + - fix AVX2 SIMD support for ZEN2 AMD CPU [\#6238](https://github.com/kokkos/kokkos/pull/6238) +- `Kokkos::printf` [\#6083](https://github.com/kokkos/kokkos/pull/6083) +- `Kokkos::sort`: support custom comparator [\#6253](https://github.com/kokkos/kokkos/pull/6253) +- `half_t` and `bhalf_t` numeric traits [\#5778](https://github.com/kokkos/kokkos/pull/5778) +- `half_t` and `bhalf_t` mixed comparisons [\#6407](https://github.com/kokkos/kokkos/pull/6407) +- `half_t` and `bhalf_t` mathematical functions [\#6124](https://github.com/kokkos/kokkos/pull/6124) +- `TeamThreadRange` `parallel_scan` with return value [\#6090](https://github.com/kokkos/kokkos/pull/6090), [\#6301](https://github.com/kokkos/kokkos/pull/6301), [\#6302](https://github.com/kokkos/kokkos/pull/6302), [\#6303](https://github.com/kokkos/kokkos/pull/6303), [\#6307](https://github.com/kokkos/kokkos/pull/6307) +- `ThreadVectorRange` `parallel_scan` with return value [\#6235](https://github.com/kokkos/kokkos/pull/6235), [\#6242](https://github.com/kokkos/kokkos/pull/6242), [\#6308](https://github.com/kokkos/kokkos/pull/6308), [\#6305](https://github.com/kokkos/kokkos/pull/6305), [\#6292](https://github.com/kokkos/kokkos/pull/6292) +- Add team-level std algorithms [\#6200](https://github.com/kokkos/kokkos/pull/6200), [\#6205](https://github.com/kokkos/kokkos/pull/6205), [\#6207](https://github.com/kokkos/kokkos/pull/6207), [\#6208](https://github.com/kokkos/kokkos/pull/6208), [\#6209](https://github.com/kokkos/kokkos/pull/6209), 
[\#6210](https://github.com/kokkos/kokkos/pull/6210), [\#6211](https://github.com/kokkos/kokkos/pull/6211), [\#6212](https://github.com/kokkos/kokkos/pull/6212), [\#6213](https://github.com/kokkos/kokkos/pull/6213), [\#6256](https://github.com/kokkos/kokkos/pull/6256), [\#6258](https://github.com/kokkos/kokkos/pull/6258), [\#6350](https://github.com/kokkos/kokkos/pull/6350), [\#6351](https://github.com/kokkos/kokkos/pull/6351) +- Serial: Allow for distinct execution space instances [\#6441](https://github.com/kokkos/kokkos/pull/6441) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Fixed potential data race in Cuda `parallel_reduce` [\#6236](https://github.com/kokkos/kokkos/pull/6236) +- Use `cudaMallocAsync` by default [\#6402](https://github.com/kokkos/kokkos/pull/6402) +- Bugfix for using Kokkos from a thread of execution [\#6299](https://github.com/kokkos/kokkos/pull/6299) + +#### HIP: +- New naming convention for AMD GPU: VEGA906, VEGA908, VEGA90A, NAVI1030 to AMD_GFX906, AMD_GFX908, AMD_GFX90A, AMD_GFX1030 [\#6266](https://github.com/kokkos/kokkos/pull/6266) +- Add initial support for gfx942: [\#6358](https://github.com/kokkos/kokkos/pull/6358) +- Improve reduction performance [\#6229](https://github.com/kokkos/kokkos/pull/6229) +- Deprecate `HIP(hipStream_t,bool)` constructor [\#6401](https://github.com/kokkos/kokkos/pull/6401) +- Add support for Graph [\#6370](https://github.com/kokkos/kokkos/pull/6370) +- Improve reduction performance when using Teams [\#6284](https://github.com/kokkos/kokkos/pull/6284) +- Fix concurrency calculation [\#6479](https://github.com/kokkos/kokkos/pull/6479) +- Fix potential data race in HIP `parallel_reduce` [\#6429](https://github.com/kokkos/kokkos/pull/6429) + +#### SYCL: +- Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246) +- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) 
[\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) +- Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321) +- Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322) +- Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189) + +#### OpenACC: +- Support Clacc compiler [\#6250](https://github.com/kokkos/kokkos/pull/6250) + +### General Enhancements +- Add missing `is_*_view` traits and `is_*_view_v` helper variable templates for `DynRankView`, `DynamicView`, `OffsetView`, `ScatterView` containers [\#6195](https://github.com/kokkos/kokkos/pull/6195) +- Make `nvcc_wrapper` and `compiler_launcher` scripts more portable by switching to a `#!/usr/bin/env` shebang [\#6357](https://github.com/kokkos/kokkos/pull/6357) +- Add an improved `Kokkos::malloc` / `Kokkos::free` performance test [\#6377](https://github.com/kokkos/kokkos/pull/6377) +- Ensure `Views` with `size==0` can be used with `deep_copy` [\#6273](https://github.com/kokkos/kokkos/pull/6273) +- `Kokkos::abort` is moved to header `Kokkos_Abort.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445) +- `KOKKOS_ASSERT`, `KOKKOS_EXPECTS`, `KOKKOS_ENSURES` are moved to header `Kokkos_Assert.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445) +- Add a permuted-index mode to the gups benchmark [\#6378](https://github.com/kokkos/kokkos/pull/6378) +- Check for overflow during backend initialization [\#6159](https://github.com/kokkos/kokkos/pull/6159) +- Make constraints on `Kokkos::sort` more visible [\#6234](https://github.com/kokkos/kokkos/pull/6234) and cleanup API [\#6239](https://github.com/kokkos/kokkos/pull/6239) +- Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474) + + +### Build System Changes + +- Export `Kokkos_CXX_COMPILER_VERSION` 
[\#6282](https://github.com/kokkos/kokkos/pull/6282) +- Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342) + +### Incompatibilities (i.e. breaking changes) + - Ensure that `Kokkos::complex` only gets instantiated for cv-unqualified floating-point types [\#6251](https://github.com/kokkos/kokkos/pull/6251) + - Removed (deprecated-3) support for volatile join operators in reductions [\#6385](https://github.com/kokkos/kokkos/pull/6385) + - Enforce `ViewCtorArgs` restrictions for `create_mirror_view` [\#6304](https://github.com/kokkos/kokkos/pull/6304) + - SIMD types for ARM NEON are not autodetected anymore but need `Kokkos_ARCH_ARM_NEON` or `Kokkos_ARCH_NATIVE=ON` [\#6394](https://github.com/kokkos/kokkos/pull/6394) + - Remove `#include <iostream>` from headers where possible [\#6482](https://github.com/kokkos/kokkos/pull/6482) + +### Deprecations +- Deprecated `Kokkos::vector` [\#6252](https://github.com/kokkos/kokkos/pull/6252) +- All host allocation mechanisms except for `STD_MALLOC` have been deprecated [\#6341](https://github.com/kokkos/kokkos/pull/6341) + +### Bug Fixes + - Missing memory fence in `RandomPool::free_state` functions [\#6290](https://github.com/kokkos/kokkos/pull/6290) + - Fix for corner case in `Kokkos::Experimental::is_partitioned` algorithm [\#6257](https://github.com/kokkos/kokkos/pull/6257) + - Fix initialization of scratch lock variables in the `Cuda` backend [\#6433](https://github.com/kokkos/kokkos/pull/6433) + - Fixes for `Kokkos::Array` [\#6372](https://github.com/kokkos/kokkos/pull/6372) + - Fixed symlink configure issue for Windows [\#6241](https://github.com/kokkos/kokkos/pull/6241) + - OpenMPTarget init-join fix [\#6444](https://github.com/kokkos/kokkos/pull/6444) + - Fix atomic operations bug for Min and Max [\#6435](https://github.com/kokkos/kokkos/pull/6435) + - Fix implementation for `cyl_bessel_i0` [\#6484](https://github.com/kokkos/kokkos/pull/6484) + - Fix various NVCC warnings in `BinSort`, 
`Array`, and bit manipulation function templates [\#6483](https://github.com/kokkos/kokkos/pull/6483) + +## [4.1.00](https://github.com/kokkos/kokkos/tree/4.1.00) (2023-06-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.01...4.1.00) ### Features: From 81a9586539f4d71db4d6219aca8c2ca613be7ddf Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 17 Nov 2023 14:27:19 -0700 Subject: [PATCH 134/432] Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF (#6593) * Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF * Clean up tutorials --- core/src/Kokkos_Macros.hpp | 6 --- core/src/setup/Kokkos_Setup_SYCL.hpp | 8 ---- .../unit_test/TestBitManipulationBuiltins.hpp | 14 +++--- core/unit_test/TestMathematicalFunctions.hpp | 46 +++++++++---------- .../tutorial/01_hello_world/hello_world.cpp | 7 +-- .../hello_world_lambda.cpp | 10 ++-- .../01_thread_teams/thread_teams.cpp | 10 ++-- .../thread_teams_lambda.cpp | 16 +++---- .../nested_parallel_for.cpp | 15 ++---- 9 files changed, 47 insertions(+), 85 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 3f53fcba683..a77e50b65b3 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -339,12 +339,6 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__) -#endif - //---------------------------------------------------------------------------- // Define final version of functions. 
This is so that clang tidy can find these // macros more easily diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61f..72017e38d88 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,4 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) \ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) -#endif - #endif diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff618..2f3bcfe817d 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -804,26 +804,26 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } } @@ -848,11 +848,11 @@ struct TestBitCastFunction { } if (!(bit_cast(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git 
a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index be479e0219a..3150a015b5d 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -1304,12 +1304,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) { ++e; @@ -1360,26 +1360,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1387,8 +1387,7 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. 
|| !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same(4.f))), @@ -1420,7 +1419,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1429,7 +1428,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast(-6.2f), static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1438,17 +1437,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1457,8 +1456,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } 
static_assert(std::is_same(4.f), @@ -1494,7 +1492,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1503,7 +1501,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1512,18 +1510,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1532,7 +1530,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } @@ -1748,7 +1746,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed isnan(KE::half_t)\n"); } if 
(isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1758,7 +1756,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 diff --git a/example/tutorial/01_hello_world/hello_world.cpp b/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af833..22b8b6d63c8 100644 --- a/example/tutorial/01_hello_world/hello_world.cpp +++ b/example/tutorial/01_hello_world/hello_world.cpp @@ -58,12 +58,7 @@ struct hello_world { // is unnecessary but harmless. KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; diff --git a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f3076361..909765e1fc3 100644 --- a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -76,13 +76,9 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokkos::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); #endif // You must call finalize() after you are done using Kokkos. 
diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435b..ee3f4721d91 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c7..1e6812adead 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -57,16 +57,12 @@ int main(int narg, char* args[]) { policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. 
-#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team. + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); #endif diff --git a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d1331..75d6089e9af 100644 --- a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. 
- Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; From f0af4672cabb5f7a13293bb10b8b68c5191ff735 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 08:19:29 +0100 Subject: [PATCH 135/432] try fix --- core/unit_test/TestDeviceAndThreads.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 511b182cab0..f61c850ffd9 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,6 +17,7 @@ import unittest import subprocess +import psutil PREFIX = "$" EXECUTABLE = "$" @@ -30,7 +31,9 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 4, 6, 8]: + phys_cores_count = psutil.cpu_count(logical=False) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] + for x in looplist: if x >= max_threads: break yield x From 2779b29b556948af548eb1eae6952969e0c81e17 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 09:08:02 +0100 Subject: [PATCH 136/432] avoid pyt package --- core/unit_test/TestDeviceAndThreads.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index f61c850ffd9..e246ebba9ff 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,7 +17,7 @@ import unittest import subprocess -import psutil +import os #psutil PREFIX = "$" EXECUTABLE = 
"$" @@ -31,7 +31,16 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - phys_cores_count = psutil.cpu_count(logical=False) + #phys_cores_count = psutil.cpu_count(logical=False) + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + if os.name == 'nt': + args = ['wmic', 'cpu', 'get', 'NumberOfCores'] + + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = result.stdout.decode('utf-8') + phys_cores_count = int(output) + print(phys_cores_count) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] for x in looplist: if x >= max_threads: From 17af2f3c40c213958fce992ca7afb08b9ab4d963 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 09:49:07 +0100 Subject: [PATCH 137/432] try --- core/unit_test/TestDeviceAndThreads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index e246ebba9ff..538cd9c3707 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -32,7 +32,7 @@ def GetFlag(flag, *extra_args): def GetNumThreads(max_threads): #phys_cores_count = psutil.cpu_count(logical=False) - args = ['sysctl', '-n', 'hw.physicalcpu_max'] + args = ['nproc', '--all'] #'sysctl', '-n', 'hw.physicalcpu_max'] if os.name == 'nt': args = ['wmic', 'cpu', 'get', 'NumberOfCores'] From 68e4bedc43cdb7ce1419b3da21fb84e797a7a295 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 12:28:03 +0100 Subject: [PATCH 138/432] fix for macos --- core/unit_test/TestDeviceAndThreads.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 538cd9c3707..7624b8394b5 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,7 +17,7 @@ import unittest import subprocess -import os #psutil +import 
platform #psutil PREFIX = "$" EXECUTABLE = "$" @@ -32,15 +32,18 @@ def GetFlag(flag, *extra_args): def GetNumThreads(max_threads): #phys_cores_count = psutil.cpu_count(logical=False) - args = ['nproc', '--all'] #'sysctl', '-n', 'hw.physicalcpu_max'] - if os.name == 'nt': + args = [] + name = platform.system() + if name == 'Darwin': + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + elif name == 'Linux': + args = ['nproc', '--all'] + else: args = ['wmic', 'cpu', 'get', 'NumberOfCores'] result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = result.stdout.decode('utf-8') phys_cores_count = int(output) - print(phys_cores_count) - looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] for x in looplist: if x >= max_threads: From 61842b7d104402ce09a2ebd59e4657237425032b Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 12:32:25 +0100 Subject: [PATCH 139/432] remove comments --- core/unit_test/TestDeviceAndThreads.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 7624b8394b5..d44af1ea3d1 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,7 +17,7 @@ import unittest import subprocess -import platform #psutil +import platform PREFIX = "$" EXECUTABLE = "$" @@ -31,7 +31,6 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - #phys_cores_count = psutil.cpu_count(logical=False) args = [] name = platform.system() if name == 'Darwin': From 33a1106da121e31a8ee84bcfaf3ccaa69556370f Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 16 Nov 2023 07:52:22 +0100 Subject: [PATCH 140/432] use reference --- core/src/impl/Kokkos_HostThreadTeam.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60f..25bf5921fcf 100644 --- 
a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto & team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); From 374064ab75aa4fd1727781b01c8c1724470788ac Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sun, 19 Nov 2023 07:55:54 +0100 Subject: [PATCH 141/432] add branching --- core/unit_test/TestDeviceAndThreads.py | 4 +++- core/unit_test/UnitTest_DeviceAndThreads.cpp | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index d44af1ea3d1..95727dad85c 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -43,7 +43,9 @@ def GetNumThreads(max_threads): result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = result.stdout.decode('utf-8') phys_cores_count = int(output) - looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \ + if GetFlag("hwloc_enabled") else [1,2,3,4,5] + for x in looplist: if x >= max_threads: break diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69b..ea944bae4cd 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -68,6 +68,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -93,6 +101,7 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(device_count); KOKKOS_TEST_PRINT_FLAG(disable_warnings); 
KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG From 3dd0b825380048e378589546949d101263bf1f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Nov 2023 12:18:00 +0100 Subject: [PATCH 142/432] [ci skip] fix formatting --- core/src/Threads/Kokkos_Threads_Team.hpp | 2 +- core/src/impl/Kokkos_HostThreadTeam.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index 8f9614233b2..fd0f221365b 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -1001,7 +1001,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } - auto & team_member = loop_bounds.thread; + auto& team_member = loop_bounds.thread; // 'scan_val' output is the exclusive prefix sum scan_val = team_member.team_scan(scan_val); diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 25bf5921fcf..25f09b82865 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto & team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); From ae75d38951e2cebb3537649bd3e7ca046d285148 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 Nov 2023 15:13:38 -0500 Subject: [PATCH 143/432] GitHub Workflows: Use Ubuntu 22.04 instead of Fedora for Intel compiler testing --- .github/workflows/continuous-integration-workflow.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 8c226c3766c..6446cbacd9b 100644 --- 
a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -25,22 +25,22 @@ jobs: backend: ['OPENMP'] clang-tidy: [''] include: - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' From 0262f7405e06b409df1c3a6ba0c8901ae7387110 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Tue, 21 Nov 2023 08:56:15 +0000 Subject: [PATCH 144/432] nvcc(wrapper): adding missing `--generate-line-info` arg --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index c1400872402..1b0fda12207 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|-v|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args From a4720ce414d8b92e7ed59ec0b76dc7f1a32df176 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 
22 Nov 2023 09:16:26 -0500 Subject: [PATCH 145/432] Add clang-format check to GitHub workflows (#6612) * Add clang-format check to GitHub workflows * Use DoozyX/clang-format-lint-action * Fix capitalization; remove 'exclude' --- .github/workflows/clang-format-check.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/clang-format-check.yml diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml new file mode 100644 index 00000000000..1f557dbfcdf --- /dev/null +++ b/.github/workflows/clang-format-check.yml @@ -0,0 +1,11 @@ +name: clang-format check +on: [push, pull_request] +jobs: + formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run clang-format style check. + uses: DoozyX/clang-format-lint-action@v0.16.2 + with: + clangFormatVersion: 8 From f31436a0937ba7e21acce3049d1fac2550079140 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Mon, 27 Nov 2023 13:46:30 +0000 Subject: [PATCH 146/432] graph(HIP): adding inline keyword to fix #6623 --- core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index 3bde15444c7..7cc06d02fbe 100644 --- a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -83,7 +83,7 @@ class GraphImpl { hipGraphExec_t m_graph_exec = nullptr; }; -GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(m_graph); if (m_graph_exec) { @@ -92,12 +92,12 @@ GraphImpl::~GraphImpl() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph)); } -GraphImpl::GraphImpl(Kokkos::HIP instance) +inline GraphImpl::GraphImpl(Kokkos::HIP instance) : m_execution_space(std::move(instance)) { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0)); } -void GraphImpl::add_node( 
+inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node @@ -110,7 +110,7 @@ void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in it's policy template -void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -129,8 +129,8 @@ void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, - PredecessorRef arg_pred_ref) { +inline void GraphImpl::add_predecessor( + NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); KOKKOS_EXPECTS(pred_ptr); @@ -145,7 +145,7 @@ void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -void GraphImpl::submit() { +inline void GraphImpl::submit() { if (!m_graph_exec) { instantiate_graph(); } @@ -153,12 +153,12 @@ void GraphImpl::submit() { hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); } -Kokkos::HIP const& GraphImpl::get_execution_space() const +inline Kokkos::HIP const& GraphImpl::get_execution_space() const noexcept { return m_execution_space; } -auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(m_graph); KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), @@ -172,7 +172,7 @@ auto GraphImpl::create_root_node_ptr() { } template -auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { +inline auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) 
{ // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for // each predecessor ref, so all we need to do here is create the (trivial) From 16972af285540e4a01f39a2e0520fe29f437092b Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 27 Nov 2023 10:19:27 -0500 Subject: [PATCH 147/432] Add jenkins multibranch pipeline options --- .jenkins | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.jenkins b/.jenkins index f5d0bfcf0e8..2fa3a68f8b4 100644 --- a/.jenkins +++ b/.jenkins @@ -8,9 +8,14 @@ pipeline { } options { + disableConcurrentBuilds(abortPrevious: true) timeout(time: 6, unit: 'HOURS') } + triggers { + issueCommentTrigger('.*test this please.*') + } + stages { stage('Clang-Format') { agent { From 4d4a343e50bf4d5998efe20f97e8670c93bdf2dd Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Tue, 28 Nov 2023 09:26:25 -0700 Subject: [PATCH 148/432] Add warp sync for Cuda parallel reduce compute-sanitizer --tool=racecheck discovered a potential racecondition for Cuda parallel reductions (using range policy) where data was being updated on a single thread inside a warp, but the warp was not being synchronized before being read. --- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index fcbd75c57f9..64a97e283b7 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -243,6 +243,12 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); + } else if (word_count.value > 1) { + // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below + // might have been updated by a single thread within a warp without + // synchronization afterwards. Synchronize threads within warp to avoid + // potential racecondition. 
+ __syncwarp(0xffffffff); } for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { From c9d7bbad13b42abfa3cd458196f395ff19a0c9ef Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Wed, 29 Nov 2023 16:00:59 +0000 Subject: [PATCH 149/432] kokkos(profiling): do not finalize in any backend `Kokkos::Profiling::finalize` should only be called in `pre_finalize_internal`, as described in #6633. --- core/src/OpenMP/Kokkos_OpenMP_Instance.cpp | 2 -- core/src/Serial/Kokkos_Serial.cpp | 2 -- core/src/Threads/Kokkos_Threads_Instance.cpp | 2 -- 3 files changed, 6 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 3038345b269..32172fbc6c7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -304,8 +304,6 @@ void OpenMPInternal::finalize() { } m_initialized = false; - - Kokkos::Profiling::finalize(); } void OpenMPInternal::print_configuration(std::ostream &s) const { diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index e81e8349391..071ecdbc4fa 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -58,8 +58,6 @@ void SerialInternal::finalize() { m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0); } - Kokkos::Profiling::finalize(); - m_is_initialized = false; } diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index f5a97000000..9e7b4222aa3 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -672,8 +672,6 @@ void ThreadsInternal::finalize() { s_threads_process.m_pool_size = 1; s_threads_process.m_pool_fan_size = 0; s_threads_process.m_pool_state = ThreadState::Inactive; - - Kokkos::Profiling::finalize(); } //---------------------------------------------------------------------------- From 54c62d15d6f4733fa2e9c461c17c1154eb0c6b14 Mon Sep 17 00:00:00 
2001 From: Bruno Turcksin Date: Wed, 29 Nov 2023 09:53:56 -0500 Subject: [PATCH 150/432] Replace ubuntu:18.04 with ubuntu:20.04 as base image for clang-format --- scripts/docker/Dockerfile.clang | 42 +++------------------------------ 1 file changed, 3 insertions(+), 39 deletions(-) diff --git a/scripts/docker/Dockerfile.clang b/scripts/docker/Dockerfile.clang index 5c6abc1c6de..b493c3bbff0 100644 --- a/scripts/docker/Dockerfile.clang +++ b/scripts/docker/Dockerfile.clang @@ -1,49 +1,13 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y \ bc \ git \ build-essential \ + clang-format-8 \ wget \ - ccache \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ - KEYDUMP_FILE=keydump && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ - gpg --import ${KEYDUMP_FILE} && \ - gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ - rm ${KEYDUMP_FILE}* - -ARG CMAKE_VERSION=3.16.8 -ENV CMAKE_DIR=/opt/cmake -RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ - CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ - CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ - gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ - grep -i ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sed -e s/linux/Linux/ | sha256sum --check && \ - mkdir -p ${CMAKE_DIR} && \ - sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ - rm cmake* -ENV PATH=${CMAKE_DIR}/bin:$PATH - -ENV LLVM_DIR=/opt/llvm -RUN LLVM_VERSION=8.0.0 && \ - LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \ - LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ - SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget 
--quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ - wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \ - gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \ - mkdir -p ${LLVM_DIR} && \ - tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \ - echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ - rm -rf /root/.gnupg && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${LLVM_DIR}/bin:$PATH +ENV CLANG_FORMAT_EXE=clang-format-8 From b00c1e06856af8dc2dede9057968a6ebb96d9960 Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Wed, 29 Nov 2023 15:19:12 -0700 Subject: [PATCH 151/432] update comment to include final() mention --- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 64a97e283b7..29b1479a1fb 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -244,10 +244,10 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } else if (word_count.value > 1) { - // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below - // might have been updated by a single thread within a warp without - // synchronization afterwards. Synchronize threads within warp to avoid - // potential racecondition. + // Inside cuda_single_inter_block_reduce_scan() and final() above, + // shared[i] below might have been updated by a single thread within a + // warp without synchronization afterwards. Synchronize threads within + // warp to avoid potential race condition. 
__syncwarp(0xffffffff); } From 7739ca191b5271a0c21a80a9d346e1ac7c141dba Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 30 Nov 2023 11:13:15 -0500 Subject: [PATCH 152/432] Disabling OpenACC in the CI because it emits too many warnings --- .jenkins | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/.jenkins b/.jenkins index f5d0bfcf0e8..d2a7df9b692 100644 --- a/.jenkins +++ b/.jenkins @@ -27,32 +27,32 @@ pipeline { } stage('Build') { parallel { - stage('OPENACC-NVHPC-CUDA-12.2') { - agent { - dockerfile { - filename 'Dockerfile.nvhpc' - dir 'scripts/docker' - label 'nvidia-docker && volta && large_images' - args '--env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - environment { - NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2' - } - steps { - sh '''rm -rf build && mkdir -p build && cd build && \ - /opt/cmake/bin/cmake \ - -DCMAKE_CXX_COMPILER=nvc++ \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_OPENACC=ON \ - -DKokkos_ARCH_VOLTA70=ON \ - .. && \ - make -j8 && ctest --verbose''' - } - } +// stage('OPENACC-NVHPC-CUDA-12.2') { +// agent { +// dockerfile { +// filename 'Dockerfile.nvhpc' +// dir 'scripts/docker' +// label 'nvidia-docker && volta && large_images' +// args '--env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' +// } +// } +// environment { +// NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2' +// } +// steps { +// sh '''rm -rf build && mkdir -p build && cd build && \ +// /opt/cmake/bin/cmake \ +// -DCMAKE_CXX_COMPILER=nvc++ \ +// -DCMAKE_CXX_STANDARD=17 \ +// -DKokkos_ARCH_NATIVE=ON \ +// -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ +// -DKokkos_ENABLE_TESTS=ON \ +// -DKokkos_ENABLE_OPENACC=ON \ +// -DKokkos_ARCH_VOLTA70=ON \ +// .. 
&& \ +// make -j8 && ctest --verbose''' +// } +// } stage('CUDA-12.2-NVHPC') { agent { dockerfile { From e9899a5b1ba04f2e0820bf3a34fedb03a9c2587f Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Thu, 30 Nov 2023 16:25:47 +0000 Subject: [PATCH 153/432] unorderedmap: modernize traits --- containers/src/Kokkos_UnorderedMap.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index 4b3e9ce9386..78a6a238ece 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -243,16 +243,16 @@ class UnorderedMap { using const_map_type = UnorderedMap; - static const bool is_set = std::is_void::value; - static const bool has_const_key = - std::is_same::value; - static const bool has_const_value = - is_set || std::is_same::value; + static constexpr bool is_set = std::is_void_v; + static constexpr bool has_const_key = + std::is_same_v; + static constexpr bool has_const_value = + is_set || std::is_same_v; - static const bool is_insertable_map = + static constexpr bool is_insertable_map = !has_const_key && (is_set || !has_const_value); - static const bool is_modifiable_map = has_const_key && !has_const_value; - static const bool is_const_map = has_const_key && has_const_value; + static constexpr bool is_modifiable_map = has_const_key && !has_const_value; + static constexpr bool is_const_map = has_const_key && has_const_value; using insert_result = UnorderedMapInsertResult; From 52d5c37388b2af48a52d6f3b681a7a8dbe468414 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Fri, 1 Dec 2023 03:42:46 +0100 Subject: [PATCH 154/432] nvcc wrapper: remove troubling flag to fix 6628 (#6629) Remove nvcc_wrapper flag recognition that causes issues with cmake --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index 1b0fda12207..9b935835d5f 100755 --- a/bin/nvcc_wrapper 
+++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|-dryrun|--verbose|-v|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args From ed64cea7fad930f5c8b22a82f79f005cd013c375 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Fri, 1 Dec 2023 13:18:10 +0000 Subject: [PATCH 155/432] tools(profiling): type (related to kokkos/kokkos-tools/pull/221) --- core/src/impl/Kokkos_Profiling_C_Interface.h | 4 ++-- core/src/impl/Kokkos_Profiling_Interface.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/impl/Kokkos_Profiling_C_Interface.h b/core/src/impl/Kokkos_Profiling_C_Interface.h index 731a11e917a..15c466b27ed 100644 --- a/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -154,7 +154,7 @@ enum Kokkos_Tools_OptimizationType { Kokkos_Tools_Maximize }; -struct Kokkos_Tools_OptimzationGoal { +struct Kokkos_Tools_OptimizationGoal { size_t type_id; enum Kokkos_Tools_OptimizationType goal; }; @@ -220,7 +220,7 @@ typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); typedef void (*Kokkos_Tools_contextEndFunction)( const size_t, struct Kokkos_Tools_VariableValue); typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( - const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + const size_t, const struct Kokkos_Tools_OptimizationGoal goal); struct 
Kokkos_Profiling_EventSet { Kokkos_Profiling_initFunction init; diff --git a/core/src/impl/Kokkos_Profiling_Interface.hpp b/core/src/impl/Kokkos_Profiling_Interface.hpp index af71932e47b..b66886d9f7e 100644 --- a/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -226,7 +226,7 @@ using ValueType = Kokkos_Tools_VariableInfo_ValueType; using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; using VariableInfo = Kokkos_Tools_VariableInfo; -using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using OptimizationGoal = Kokkos_Tools_OptimizationGoal; using TuningString = Kokkos_Tools_Tuning_String; using VariableValue = Kokkos_Tools_VariableValue; From 685620918f93420d8e6ec0cd9b03c72dfd2e5a6e Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Fri, 1 Dec 2023 10:57:18 -0500 Subject: [PATCH 156/432] This PR fixes the too-much-OpenACC-warning issue, mentioned in PR #6639. This PR also re-enables the OpenACC CI test. 
--- .jenkins | 52 +++++++++---------- .../atomics/Compare_Exchange_OpenACC.hpp | 16 +++--- .../desul/atomics/Fetch_Op_OpenACC.hpp | 16 +++--- .../atomics/Lock_Based_Fetch_Op_OpenACC.hpp | 16 +++--- 4 files changed, 56 insertions(+), 44 deletions(-) diff --git a/.jenkins b/.jenkins index 9673f0fb44b..2fa3a68f8b4 100644 --- a/.jenkins +++ b/.jenkins @@ -32,32 +32,32 @@ pipeline { } stage('Build') { parallel { -// stage('OPENACC-NVHPC-CUDA-12.2') { -// agent { -// dockerfile { -// filename 'Dockerfile.nvhpc' -// dir 'scripts/docker' -// label 'nvidia-docker && volta && large_images' -// args '--env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' -// } -// } -// environment { -// NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2' -// } -// steps { -// sh '''rm -rf build && mkdir -p build && cd build && \ -// /opt/cmake/bin/cmake \ -// -DCMAKE_CXX_COMPILER=nvc++ \ -// -DCMAKE_CXX_STANDARD=17 \ -// -DKokkos_ARCH_NATIVE=ON \ -// -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -// -DKokkos_ENABLE_TESTS=ON \ -// -DKokkos_ENABLE_OPENACC=ON \ -// -DKokkos_ARCH_VOLTA70=ON \ -// .. && \ -// make -j8 && ctest --verbose''' -// } -// } + stage('OPENACC-NVHPC-CUDA-12.2') { + agent { + dockerfile { + filename 'Dockerfile.nvhpc' + dir 'scripts/docker' + label 'nvidia-docker && volta && large_images' + args '--env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + environment { + NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2' + } + steps { + sh '''rm -rf build && mkdir -p build && cd build && \ + /opt/cmake/bin/cmake \ + -DCMAKE_CXX_COMPILER=nvc++ \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_OPENACC=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + .. 
&& \ + make -j8 && ctest --verbose''' + } + } stage('CUDA-12.2-NVHPC') { agent { dockerfile { diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp index 225079c15db..77149bd4741 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -33,9 +33,11 @@ T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { return return_val; } else { // FIXME_OPENACC - printf( - "DESUL error in device_atomic_exchange(): Not supported atomic operation in " - "the OpenACC backend\n"); + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + } // Acquire a lock for the address // while (!lock_address_openacc((void*)dest, scope)) { // } @@ -73,9 +75,11 @@ T device_atomic_compare_exchange( return atomicCAS(dest, compare, value); } else { // FIXME_OPENACC - printf( - "DESUL error in device_atomic_compare_exchange(): Not supported atomic " - "operation in the OpenACC backend\n"); + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + } T current_val = *dest; // Acquire a lock for the address // while (!lock_address_openacc((void*)dest, scope)) { diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp index 10294c423f9..ab570ac5787 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -390,9 +390,11 @@ std::enable_if_t, void> device_atomic_store( template std::enable_if_t, void> device_atomic_store( T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { - printf( - "DESUL error in device_atomic_store(MemoryOrderRelease): 
Not supported atomic " - "operation in the OpenACC backend\n"); + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); + } #pragma acc atomic write *ptr = val; } @@ -411,9 +413,11 @@ std::enable_if_t, T> device_atomic_load( template std::enable_if_t, T> device_atomic_load( const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { - printf( - "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " - "operation in the OpenACC backend\n"); + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + } T retval; #pragma acc atomic read retval = *ptr; diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp index 6b78ce39043..d4dd74588bd 100644 --- a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -29,9 +29,11 @@ inline T device_atomic_fetch_oper(const Oper& op, dont_deduce_this_parameter_t val, MemoryOrder /*order*/, MemoryScope scope) { - printf( - "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " - "the OpenACC backend\n"); + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + } // Acquire a lock for the address while (!lock_address((void*)dest, scope)) { } @@ -56,9 +58,11 @@ inline T device_atomic_oper_fetch(const Oper& op, dont_deduce_this_parameter_t val, MemoryOrder /*order*/, MemoryScope scope) { - printf( - "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " - "the OpenACC backend\n"); + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in 
device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + } // Acquire a lock for the address while (!lock_address((void*)dest, scope)) { } From 07cdd7000fc57f818a4cb063524e9bc7cfc6bd72 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Mon, 4 Dec 2023 11:12:45 +0100 Subject: [PATCH 157/432] add missing header fix #6644 --- algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp | 1 + algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp | 1 + .../unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp | 1 + .../unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp | 1 + 4 files changed, 4 insertions(+) diff --git a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 6ab68a1987d..56387e3c92b 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { diff --git a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 8e60a43e5ff..fa6294ea4ca 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 9dac3ce75ff..f574832cc63 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index a90a68ca1d7..d96e582582f 
100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { From a41dba5860c019f503822854f5ba8485937e1733 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 4 Dec 2023 10:16:10 -0500 Subject: [PATCH 158/432] SYCL: Restrict workaround for is_device_copyable to oneAPI versions before 2024.0.0 (#6532) * SYCL: Restrict workaround for is_device_copyable to oneAPI versions before 2024.0.0 * Allow for checking __LIBSYCL_MAJOR_VERSION as well * Drop check for KOKKOS_ENABLE_SYCL * Fix __LIBSYCL_MAJOR_VERSION check --- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 7f9ce48109c..ab7e8ce71e0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -333,8 +333,8 @@ struct sycl::is_device_copyable< Kokkos::Experimental::Impl::SYCLFunctionWrapper> : std::true_type {}; -// FIXME_SYCL Remove when this specialization when specializations for -// sycl::device_copyable also apply to const-qualified types. 
+#if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ + (defined(__LIBSYCL_MAJOR_VERSION) && __LIBSYCL_MAJOR_VERSION < 7) template struct NonTriviallyCopyableAndDeviceCopyable { NonTriviallyCopyableAndDeviceCopyable( @@ -359,3 +359,4 @@ struct sycl::is_device_copyable< : std::true_type {}; #endif #endif +#endif From 71729af719b2eb20b512efad05ae609e74e1dd7a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 5 Dec 2023 15:26:29 -0500 Subject: [PATCH 159/432] Fixup test math functions ulp should double -> int --- core/unit_test/TestMathematicalFunctions.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 3150a015b5d..39f17639b73 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -286,7 +286,7 @@ struct FloatingPointComparison { public: template - KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const { + KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, int ulp) const { auto abs_tol = eps(fpv) * ulp; bool ar = absolute(fpv) < abs_tol; @@ -299,8 +299,7 @@ struct FloatingPointComparison { } template - KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, - double ulp) const { + KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, int ulp) const { if (lhs == 0) { return compare_near_zero(rhs, ulp); } else if (rhs == 0) { @@ -347,7 +346,7 @@ struct math_function_name; } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -372,7 +371,7 @@ struct math_function_name; math_unary_function_return_type_t>::value); \ return REF_FUNC; \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static 
KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -474,7 +473,7 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathBinaryFunction_##FUNC; \ template <> \ @@ -510,7 +509,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); math_ternary_function_return_type_t>::value); \ return std::FUNC(x, y, z); \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk3_##FUNC = MathTernaryFunction_##FUNC; \ template <> \ From b9b63dfd89fb670d9e18309ad25ef74c6bdd9400 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 5 Dec 2023 22:00:57 -0500 Subject: [PATCH 160/432] Drop DualView converting copy assignment operator It is unnecessary, we already have a converting constructor and a regular copy assignment operator. --- containers/src/Kokkos_DualView.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index 84bced2cc44..e821570a8d5 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -292,15 +292,6 @@ class DualView : public ViewTraits { d_view(src.d_view), h_view(src.h_view) {} - //! Copy assignment operator (shallow copy assignment) - template - DualView& operator=(const DualView& src) { - modified_flags = src.modified_flags; - d_view = src.d_view; - h_view = src.h_view; - return *this; - } - //! Subview constructor template DualView(const DualView& src, const Arg0& arg0, Args... 
args) From 9fd95ebcb1d2f590d376e42ca993068138c0c829 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 6 Dec 2023 11:41:48 -0500 Subject: [PATCH 161/432] Don't use rocm-docker for clang-format --- .jenkins | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins b/.jenkins index 9673f0fb44b..2d75e2a6a1b 100644 --- a/.jenkins +++ b/.jenkins @@ -22,7 +22,7 @@ pipeline { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || rocm-docker || docker' + label 'nvidia-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } From dcdfcac91f76c9c0fba45c8c094c6153187e8766 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 6 Dec 2023 11:42:08 -0500 Subject: [PATCH 162/432] Diable HIP CI --- .jenkins | 152 +++++++++++++++++++++++++++---------------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/.jenkins b/.jenkins index 2d75e2a6a1b..6374f4b6763 100644 --- a/.jenkins +++ b/.jenkins @@ -135,82 +135,82 @@ pipeline { } } } - stage('HIP-ROCm-5.2') { - agent { - dockerfile { - filename 'Dockerfile.hipcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' - } - } - environment { - OMP_NUM_THREADS = 8 - OMP_MAX_ACTIVE_LEVELS = 3 - OMP_PLACES = 'threads' - OMP_PROC_BIND = 'spread' - } - steps { - sh 'ccache --zero-stats' - sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - 
-DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_HIP=ON \ - -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ - .. && \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('HIP-ROCm-5.6-C++20') { - agent { - dockerfile { - filename 'Dockerfile.hipcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' - label 'rocm-docker && vega' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ - -DCMAKE_CXX_STANDARD=20 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_HIP=ON \ - .. 
&& \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } +// stage('HIP-ROCm-5.2') { +// agent { +// dockerfile { +// filename 'Dockerfile.hipcc' +// dir 'scripts/docker' +// additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' +// label 'rocm-docker && vega' +// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' +// } +// } +// environment { +// OMP_NUM_THREADS = 8 +// OMP_MAX_ACTIVE_LEVELS = 3 +// OMP_PLACES = 'threads' +// OMP_PROC_BIND = 'spread' +// } +// steps { +// sh 'ccache --zero-stats' +// sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' +// sh '''rm -rf build && mkdir -p build && cd build && \ +// cmake \ +// -DCMAKE_BUILD_TYPE=Debug \ +// -DCMAKE_CXX_COMPILER=hipcc \ +// -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \ +// -DCMAKE_CXX_STANDARD=17 \ +// -DKokkos_ARCH_NATIVE=ON \ +// -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ +// -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ +// -DKokkos_ENABLE_TESTS=ON \ +// -DKokkos_ENABLE_BENCHMARKS=ON \ +// -DKokkos_ENABLE_HIP=ON \ +// -DKokkos_ENABLE_OPENMP=ON \ +// -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ +// .. 
&& \ +// make -j8 && ctest --verbose''' +// } +// post { +// always { +// sh 'ccache --show-stats' +// } +// } +// } +// stage('HIP-ROCm-5.6-C++20') { +// agent { +// dockerfile { +// filename 'Dockerfile.hipcc' +// dir 'scripts/docker' +// additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' +// label 'rocm-docker && vega' +// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' +// } +// } +// steps { +// sh 'ccache --zero-stats' +// sh '''rm -rf build && mkdir -p build && cd build && \ +// cmake \ +// -DCMAKE_BUILD_TYPE=RelWithDebInfo \ +// -DCMAKE_CXX_COMPILER=hipcc \ +// -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ +// -DCMAKE_CXX_STANDARD=20 \ +// -DKokkos_ARCH_NATIVE=ON \ +// -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ +// -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ +// -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ +// -DKokkos_ENABLE_TESTS=ON \ +// -DKokkos_ENABLE_BENCHMARKS=ON \ +// -DKokkos_ENABLE_HIP=ON \ +// .. 
&& \ +// make -j8 && ctest --verbose''' +// } +// post { +// always { +// sh 'ccache --show-stats' +// } +// } +// } /* stage('OPENMPTARGET-ROCm-5.2') { agent { From eadc210bf4ae3d06c4863e4c7001abf5732d3b4f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 6 Dec 2023 22:00:33 +0000 Subject: [PATCH 163/432] Remove deprecation warning for AllocationMechanism for gcc <11.0 --- core/src/Kokkos_HostSpace.hpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index c20bb1abc60..d04343de292 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -75,12 +75,19 @@ class HostSpace { /**\brief Non-default memory space instance to choose allocation mechansim, * if available */ - enum KOKKOS_DEPRECATED AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; +#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100 + // We see deprecation warnings even when not using the deprecated + // HostSpace constructor below when using gcc before release 11. + enum +#else + enum KOKKOS_DEPRECATED +#endif + AllocationMechanism { + STD_MALLOC, + POSIX_MEMALIGN, + POSIX_MMAP, + INTEL_MM_ALLOC + }; KOKKOS_DEPRECATED explicit HostSpace(const AllocationMechanism&); From 843fca336a8bc056b6650310f6c5152f06e3b7e7 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 7 Dec 2023 19:15:11 -0800 Subject: [PATCH 164/432] OpenMPTarget: clang extensions for dynamic shared memory. (#6380) * OpenMPTarget: clang extensions for dynamic shared memory. * OpenMPTarget: Rebase and use macros for pragma. * OpenMPTarget: Fix bug. Add Macros file. * OpenMPTarget: Fix resize_scratch. * OpenMPTarget: restore map clause. * OpenMPTarget: Fix comment. * OpenMPTarget: change L0 values in resize_scratch. * OpenMPTarget: Fix comment and spaces. 
--------- Co-authored-by: Rahulkumar Gayatri --- .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 5 ++ .../Kokkos_OpenMPTarget_Macros.hpp | 46 +++++++++++++++++++ .../Kokkos_OpenMPTarget_Parallel.hpp | 35 ++++++++++---- .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 7 ++- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 34 ++++++++------ 5 files changed, 102 insertions(+), 25 deletions(-) create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index ea434b39533..b39f5aca353 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -99,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, int64_t shmem_size_L1, int64_t league_size) { Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp new file mode 100644 index 00000000000..2bd672f4d06 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP +#define KOKKOS_OPENMPTARGET_MACROS_HPP + +// Intel architectures prefer the classical hierarchical parallelism that relies +// on OpenMP. +#if defined(KOKKOS_ARCH_INTEL_GPU) +#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU +#endif + +// Define a macro for llvm compiler greater than version 17 and on NVIDIA and +// AMD GPUs. This would be useful in cases where non-OpenMP standard llvm +// extensions can be used. +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \ + (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)) +#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#endif + +#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x) +#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \ + KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x) + +// Use scratch memory extensions to request dynamic shared memory for the +// right compiler/architecture combination. +#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N) +#else +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) +#endif + +#endif // KOKKOS_OPENMPTARGET_MACROS_HPP diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 2a7063b966a..466dee2a563 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -24,12 +24,7 @@ #include #include "Kokkos_OpenMPTarget_Abort.hpp" - -// Intel architectures prefer the classical hierarchical parallelism that relies -// on OpenMP. 
-#if defined(KOKKOS_ARCH_INTEL_GPU) -#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU -#endif +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -248,15 +243,37 @@ class OpenMPTargetExecTeamMember { // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for // hierarchical reduction. There is an additional 10% of the requested // scratch memory allocated per team as padding. Hence the product with 0.1. + // + // Use llvm extensions for dynamic shared memory with compilers/architecture + // combinations where it is supported. + // + // Size allocated in HBM will now change based on whether we use llvm + // extensions. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1; +#else + const int total_shmem = + shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1; +#endif + + // Per team offset for buffer in HBM. 
const int reduce_offset = - m_shmem_block_index * - (shmem_size_L0 + shmem_size_L1 + - ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); + m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE); + +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE; + char* l0_scratch = + static_cast(llvm_omp_target_dynamic_shared_alloc()); + m_team_shared = scratch_memory_space( + l0_scratch, shmem_size_L0, static_cast(glb_scratch) + l1_offset, + shmem_size_L1); +#else const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; const int l1_offset = l0_offset + shmem_size_L0; m_team_shared = scratch_memory_space( (static_cast(glb_scratch) + l0_offset), shmem_size_L0, static_cast(glb_scratch) + l1_offset, shmem_size_L1); +#endif m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; m_league_rank = league_rank; m_team_rank = omp_tid; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 1abc925caed..26085f11400 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -140,8 +141,10 @@ class ParallelFor, // guarantees that the number of teams specified in the `num_teams` clause is // always less than or equal to the maximum concurrently running teams. 
#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \ - num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams thread_limit(team_size) firstprivate(a_functor) + num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel { if (omp_get_num_teams() > max_active_teams) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index fb75f05f270..eb3dc3773c4 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize, initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(max_active_teams) thread_limit(team_size) + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom + : result) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) + // Use scratch memory extensions to request dynamic shared memory for + // the right compiler/architecture combination. 
+ KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ + is_device_ptr(scratch_ptr) reduction(+: result) \ + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(+ : result) { if (omp_get_num_teams() > max_active_teams) @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize, return; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { + // Use scratch memory extensions to request dynamic shared memory for the + // right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(nteams) thread_limit(team_size) map(to + : f) + is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : final_reducer) \ - is_device_ptr(scratch_ptr) +#pragma omp target teams distribute parallel for simd firstprivate( \ + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); From fb0380b91b79b5744ea2b12e28a5663c59aeacc4 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 9 Dec 2023 08:14:14 -0700 Subject: [PATCH 165/432] Fix builtin_unreachable use for MSVC/CUDA Also split math functions test differently to avoid need for bigobj --- .../unit_tests/TestStdAlgorithmsCommon.hpp | 2 +- core/unit_test/TestMathematicalFunctions.hpp | 22 ++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index b962218b5f0..ada634462f0 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp 
@@ -199,7 +199,7 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) { // this is needed for intel to avoid // error #1011: missing return statement at end of non-void function #if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 3150a015b5d..680833818be 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -31,7 +31,7 @@ #endif #if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && !defined(KOKKOS_COMPILER_MSVC)) #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable(); #else #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE @@ -394,10 +394,12 @@ DEFINE_UNARY_FUNCTION_EVAL(log2, 2); DEFINE_UNARY_FUNCTION_EVAL(log1p, 2); #endif -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2); DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2); +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 DEFINE_UNARY_FUNCTION_EVAL(sin, 2); DEFINE_UNARY_FUNCTION_EVAL(cos, 2); DEFINE_UNARY_FUNCTION_EVAL(tan, 2); @@ -483,11 +485,9 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); }; \ constexpr char math_function_name::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_BINARY_FUNCTION_EVAL(pow, 2); DEFINE_BINARY_FUNCTION_EVAL(hypot, 2); -#endif -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1); DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); #endif @@ -519,7 +519,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); }; \ constexpr char 
math_function_name::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2); DEFINE_TERNARY_FUNCTION_EVAL(fma, 2); #endif @@ -787,7 +787,9 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) { // TODO atan2 } +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 TEST(TEST_CATEGORY, mathematical_functions_power_functions) { TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11}); TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l}); @@ -1558,6 +1560,7 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) { // TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 template struct TestIsFinite { TestIsFinite() { run(); } @@ -1581,6 +1584,7 @@ struct TestIsFinite { ++e; Kokkos::printf("failed isfinite(float)\n"); } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if (!isfinite(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isfinite(quiet_NaN::value) || @@ -1601,6 +1605,7 @@ struct TestIsFinite { ++e; Kokkos::printf("failed isfinite(KE::bhalf_t)\n"); } +#endif if (!isfinite(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isfinite(quiet_NaN::value) || @@ -1660,6 +1665,7 @@ struct TestIsInf { ++e; Kokkos::printf("failed isinf(float)\n"); } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if (isinf(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isinf(quiet_NaN::value) || @@ -1680,6 +1686,7 @@ struct TestIsInf { ++e; Kokkos::printf("failed isinf(KE::bhalf_t)\n"); } +#endif if (isinf(3.) 
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isinf(quiet_NaN::value) || @@ -1738,6 +1745,7 @@ struct TestIsNaN { ++e; Kokkos::printf("failed isnan(float)\n"); } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || @@ -1767,6 +1775,7 @@ struct TestIsNaN { ++e; Kokkos::printf("failed isnan(double)\n"); } +#endif #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (isnan(4.l) || !isnan(quiet_NaN::value) || !isnan(signaling_NaN::value) || @@ -1793,6 +1802,7 @@ struct TestIsNaN { TEST(TEST_CATEGORY, mathematical_functions_isnan) { TestIsNaN(); } +#endif // TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279 #endif From 458910fbf37ba7fae78e54a2e5b78522cb1b2ccb Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 9 Dec 2023 08:14:32 -0700 Subject: [PATCH 166/432] Fix missing include on msvc/cuda --- core/src/Serial/Kokkos_Serial.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index db1567610b2..75e1e1eac55 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -25,6 +25,7 @@ static_assert(false, #ifndef KOKKOS_SERIAL_HPP #define KOKKOS_SERIAL_HPP +#include #include #if defined(KOKKOS_ENABLE_SERIAL) From 7dcf1deba623ce533f905beb9f37bc26e2009144 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 9 Dec 2023 08:15:00 -0700 Subject: [PATCH 167/432] Avoid lambdas in constexpr branch for msvc/cuda --- simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4feff3a89d2..bf28ffd44d9 100644 --- a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -42,6 +42,7 @@ inline void 
host_check_gen_ctor() { simd_type blend; blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); host_check_equality(basic, rhs, lanes); @@ -63,6 +64,7 @@ inline void host_check_gen_ctor() { host_check_equality(blend, result, lanes); } +#endif } template From 249f8b4fb42679d37ee4cca9d0c3012739050ac2 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 9 Dec 2023 08:15:18 -0700 Subject: [PATCH 168/432] Sidestep lacking CTAD support msvc/cuda --- algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp | 2 +- algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index b32a9be3a17..b5aa27c7c38 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -139,7 +139,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < sourceView.extent(0); ++i) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, 
Kokkos::ALL()); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp index 1928f955880..21da333e75d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp @@ -191,7 +191,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, // ----------------------------------------------- auto returnView_h = create_host_space_copy(returnView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) { auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL()); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp index c0bbdfa3904..78ab6bf1f8d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp @@ -240,7 +240,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, "stdDestTrueView", numTeams, numCols); Kokkos::View stdDestFalseView( "stdDestFalseView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) { auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp index 954d4612468..370e91cc1ff 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp @@ -197,7 +197,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, auto distancesView_h = 
create_host_space_copy(distancesView); auto dataViewAfterOp_h = create_host_space_copy(dataView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) { auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL()); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index 2082fa97288..ce18eb4d319 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) { auto rowFrom = Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp index 3315f281da6..3dd7cb764c6 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp @@ -127,7 +127,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { // ----------------------------------------------- // check against std // ----------------------------------------------- - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); auto dataViewAfterOp_h = create_host_space_copy(dataView); auto distancesView_h = create_host_space_copy(distancesView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); diff --git 
a/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index ae43a2a4269..d0217aed7a8 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -145,7 +145,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < sourceView.extent(0); ++i) { auto rowFrom = Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp index 1d5d9578f94..d79b53d3551 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp @@ -103,7 +103,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j); } } - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < dataView.extent(0); ++i) { auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL()); std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal); From c6d01e9435c0f16c84e8d191d723ebe5ef863cf7 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 9 Dec 2023 08:28:32 -0700 Subject: [PATCH 169/432] Fix formatting --- algorithms/unit_tests/TestStdAlgorithmsCommon.hpp | 5 +++-- core/unit_test/TestMathematicalFunctions.hpp | 5 +++-- simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 
ada634462f0..3eb963faf2d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -198,8 +198,9 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) { // this is needed for intel to avoid // error #1011: missing return statement at end of non-void function -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && !defined(KOKKOS_COMPILER_MSVC)) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 680833818be..a17f38aa68b 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -30,8 +30,9 @@ #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS #endif -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && !defined(KOKKOS_COMPILER_MSVC)) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable(); #else #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE diff --git a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index bf28ffd44d9..4af08c266bb 100644 --- a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -42,7 +42,7 @@ inline void host_check_gen_ctor() { simd_type blend; blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); -#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { simd_type 
basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); host_check_equality(basic, rhs, lanes); From e524ec7777bb2f6f06391bbea0bc007f624ee599 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 9 Dec 2023 15:42:20 -0700 Subject: [PATCH 170/432] Move header for Damien because he is right --- core/src/Serial/Kokkos_Serial.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index 75e1e1eac55..67119cac164 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -25,12 +25,12 @@ static_assert(false, #ifndef KOKKOS_SERIAL_HPP #define KOKKOS_SERIAL_HPP -#include #include #if defined(KOKKOS_ENABLE_SERIAL) #include #include +#include #include #include #include From ed08974c76ab00a28897c31b876dbd56d283d86d Mon Sep 17 00:00:00 2001 From: mperrinel Date: Mon, 11 Dec 2023 15:59:06 +0100 Subject: [PATCH 171/432] Unit test for issue 3371 (negative vector length should not yield a negative max_team_size) (#6076) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * #3371: Added TeamPolicy construction test * Check recommended team size * Move size_max and size_recommended tests to TeamPolicy constructor test * Define tag type outside of test body * Add missing overload * tests: remove spurious check * Remove redundant qualifier Co-authored-by: Damien L-G * Fix formatting --------- Co-authored-by: Cezary Skrzyński Co-authored-by: Cezary Skrzyński Co-authored-by: Damien L-G --- core/unit_test/TestTeamPolicyConstructors.hpp | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestTeamPolicyConstructors.hpp b/core/unit_test/TestTeamPolicyConstructors.hpp index 5b0bfdb1755..9d89f757086 100644 --- a/core/unit_test/TestTeamPolicyConstructors.hpp +++ b/core/unit_test/TestTeamPolicyConstructors.hpp @@ -20,11 +20,24 @@ namespace { +struct SomeTag {}; + +struct FunctorFor { + 
KOKKOS_FUNCTION + void operator()( + Kokkos::TeamPolicy::member_type const&) const {} + + KOKKOS_FUNCTION + void operator()( + SomeTag, Kokkos::TeamPolicy::member_type const&) const {} +}; + template void test_run_time_parameters() { int league_size = 131; using ExecutionSpace = typename Policy::execution_space; + using ParallelTag = Kokkos::ParallelForTag; int team_size = 4 < ExecutionSpace().concurrency() ? 4 : ExecutionSpace().concurrency(); #ifdef KOKKOS_ENABLE_HPX @@ -44,6 +57,8 @@ void test_run_time_parameters() { ASSERT_EQ(p1.team_size(), team_size); ASSERT_GT(p1.chunk_size(), 0); ASSERT_EQ(p1.scratch_size(0), 0u); + ASSERT_GT(p1.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p1.team_size_recommended(FunctorFor(), ParallelTag()), 0); Policy p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ(p1.league_size(), league_size); @@ -112,6 +127,8 @@ void test_run_time_parameters() { Policy p8; // default constructed ASSERT_EQ(p8.league_size(), 0); ASSERT_EQ(p8.scratch_size(0), 0u); + ASSERT_GT(p8.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p8.team_size_recommended(FunctorFor(), ParallelTag()), 0); p8 = p3; // call assignment operator ASSERT_EQ(p3.league_size(), league_size); ASSERT_EQ(p3.team_size(), team_size); @@ -121,11 +138,25 @@ void test_run_time_parameters() { ASSERT_EQ(p8.team_size(), team_size); ASSERT_EQ(p8.chunk_size(), chunk_size); ASSERT_EQ(p8.scratch_size(0), size_t(scratch_size)); + + Policy p9(league_size, Kokkos::AUTO); + ASSERT_EQ(p9.league_size(), league_size); + ASSERT_GT(p9.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p9.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p10(league_size, team_size, Kokkos::AUTO); + ASSERT_EQ(p10.league_size(), league_size); + ASSERT_EQ(p10.team_size(), team_size); + ASSERT_GT(p10.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p10.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p11(league_size, Kokkos::AUTO, Kokkos::AUTO); + 
ASSERT_EQ(p11.league_size(), league_size); + ASSERT_GT(p11.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p11.team_size_recommended(FunctorFor(), ParallelTag()), 0); } TEST(TEST_CATEGORY, team_policy_runtime_parameters) { - struct SomeTag {}; - using TestExecSpace = TEST_EXECSPACE; using DynamicSchedule = Kokkos::Schedule; using LongIndex = Kokkos::IndexType; From 379d5db1a67a1bad6d7d9c8e25a6933655fc8c5e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 12 Dec 2023 10:29:52 -0700 Subject: [PATCH 172/432] Add CMakeLists.txt for stream benchmark --- benchmarks/CMakeLists.txt | 1 + benchmarks/stream/CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 benchmarks/stream/CMakeLists.txt diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index cccf7c759e1..bf946714d21 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,2 +1,3 @@ KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) diff --git a/benchmarks/stream/CMakeLists.txt b/benchmarks/stream/CMakeLists.txt new file mode 100644 index 00000000000..0dded6e3a54 --- /dev/null +++ b/benchmarks/stream/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + stream + SOURCES stream-kokkos.cpp +) From 76ea3a3a95e4619b80e3adf7edda6db05d4ed32d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 13 Dec 2023 06:58:04 -0700 Subject: [PATCH 173/432] Do not negate the dependent true traits helper --- .../OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 3 +-- core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp | 3 +-- core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp | 9 +++------ core/src/OpenACC/Kokkos_OpenACC_Team.hpp | 6 +++--- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 2c7793dc116..ec74cfa2e53 100644 --- 
a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,8 +31,7 @@ template ::value, - "not implemented"); + static_assert(std::is_void_v, "not implemented"); } }; } // namespace Kokkos::Experimental::Impl diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index b61a05a8ee1..85267cb10a0 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -31,8 +31,7 @@ template > struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true::value, - "not implemented"); + static_assert(std::is_void_v, "not implemented"); } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 3223ce3f9af..ce4d3c62ac9 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -40,8 +40,7 @@ template ::value, - "not implemented"); + static_assert(std::is_void_v, "not implemented"); } }; @@ -129,8 +128,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, - "custom reduction is not implemented"); + static_assert(std::is_void_v, "custom reduction is not implemented"); } // Hierarchical Parallelism -> Thread vector level implementation @@ -140,8 +138,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, - "custom reduction is not 
implemented"); + static_assert(std::is_void_v, "custom reduction is not implemented"); } } // namespace Kokkos diff --git a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp index 4ec71f56ef6..f2fa481f1dc 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp @@ -82,7 +82,7 @@ class OpenACCTeamMember { // FIXME_OPENACC: team_broadcast() is not implemented. template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(std::is_void_v, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(std::is_void_v, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - !Kokkos::Impl::always_true::value, + std::is_void_v, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } From ae71e400213ae256c352e3e8c46cdc9db6d089bf Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 8 Dec 2023 09:05:20 -0500 Subject: [PATCH 174/432] Drop guards to accommodate external code defining KOKKOS_ASSERT --- core/src/Kokkos_Assert.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/Kokkos_Assert.hpp b/core/src/Kokkos_Assert.hpp index c3b9004734a..6fea286005e 100644 --- a/core/src/Kokkos_Assert.hpp +++ b/core/src/Kokkos_Assert.hpp @@ -44,9 +44,6 @@ __LINE__) " \n"); \ } \ } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) 
\ { \ if (!bool(__VA_ARGS__)) { \ @@ -58,8 +55,7 @@ __LINE__) " \n"); \ } \ } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode +#else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) #ifndef KOKKOS_ASSERT From a996c12a0315957458df283b5ea797b3e10573ee Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 13 Dec 2023 09:04:40 -0700 Subject: [PATCH 175/432] Use omp_get_max_active_levels() when supported --- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 583beee3252..c0eb1c9f4d9 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -45,8 +45,13 @@ namespace Kokkos { namespace Impl { inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && - !(omp_get_nested() && (omp_get_level() == 1))); + return (OpenMP::in_parallel(space) && !( +#if _OPENMP >= 201511 + (omp_get_max_active_levels() > 1) +#else + omp_get_nested() +#endif + && (omp_get_level() == 1))); } } // namespace Impl From 293319c5844f4d8eea51eb9cd1457115a5016d3f Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Wed, 13 Dec 2023 12:33:42 -0700 Subject: [PATCH 176/432] Add missing gfx940 --- Makefile.kokkos | 9 ++++++++- cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 6 +++--- core/src/HIP/Kokkos_HIP_Instance.hpp | 3 ++- generate_makefile.bash | 1 + 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 97b92a32892..506e3339bfc 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: 
BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100 +# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" @@ -406,6 +406,8 @@ endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) @@ -1099,6 +1101,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 +endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 3713d269fa0..321678dcf6f 100644 --- a/cmake/KokkosCore_config.h.in +++ 
b/cmake/KokkosCore_config.h.in @@ -112,6 +112,7 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX906 #cmakedefine KOKKOS_ARCH_AMD_GFX908 #cmakedefine KOKKOS_ARCH_AMD_GFX90A +#cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 920ce8eadfc..b5a8e38950a 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -94,9 +94,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR K ENDIF() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942) +LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) +LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) +LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index ef140ec46c0..63ad66686bb 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -30,7 +30,8 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ - defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ + defined(KOKKOS_ARCH_AMD_GFX942) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ diff --git a/generate_makefile.bash b/generate_makefile.bash index 1b216d9fe35..301a1fceb5a 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -160,6 +160,7 @@ 
display_help_text() { echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906" echo " AMD_GFX908 = AMD GPU MI100 GFX908" echo " AMD_GFX90A = AMD GPU MI200 GFX90A" + echo " AMD_GFX940 = AMD GPU MI300 GFX940" echo " AMD_GFX942 = AMD GPU MI300 GFX942" echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030" echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100" From 33db3046a173b5e96d1dff62c8555a8da5cea9bb Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 14 Dec 2023 09:47:23 -0700 Subject: [PATCH 177/432] Add Impl::always_false type-dependent false trait --- core/src/impl/Kokkos_Utilities.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/impl/Kokkos_Utilities.hpp b/core/src/impl/Kokkos_Utilities.hpp index 7e2f130564f..cadeed1a6d8 100644 --- a/core/src/impl/Kokkos_Utilities.hpp +++ b/core/src/impl/Kokkos_Utilities.hpp @@ -49,6 +49,11 @@ struct integral_constant { template struct always_true : std::true_type {}; +// type-dependent expression that is always false intended for use in +// static_assert to check "we should never get there" +template +struct always_false : std::false_type {}; + //============================================================================== #if defined(__cpp_lib_type_identity) From e4a7cfc78f0564f2aefb8a98cab3144baffd15b6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 14 Dec 2023 09:50:27 -0700 Subject: [PATCH 178/432] Per review prefer always_false::value to is_void_v --- .../OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 3 ++- core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp | 3 ++- core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp | 9 ++++++--- core/src/OpenACC/Kokkos_OpenACC_Team.hpp | 6 +++--- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index ec74cfa2e53..b02ad8dfd95 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++
b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,7 +31,8 @@ template , "not implemented"); + static_assert(Kokkos::Impl::always_false::value, + "not implemented"); } }; } // namespace Kokkos::Experimental::Impl diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index 85267cb10a0..493316050b0 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -31,7 +31,8 @@ template > struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(std::is_void_v, "not implemented"); + static_assert(Kokkos::Impl::always_false::value, + "not implemented"); } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index ce4d3c62ac9..1000a733b89 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -40,7 +40,8 @@ template , "not implemented"); + static_assert(Kokkos::Impl::always_false::value, + "not implemented"); } }; @@ -128,7 +129,8 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(std::is_void_v, "custom reduction is not implemented"); + static_assert(Kokkos::Impl::always_false::value, + "custom reduction is not implemented"); } // Hierarchical Parallelism -> Thread vector level implementation @@ -138,7 +140,8 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(std::is_void_v, "custom reduction is not implemented"); + static_assert(Kokkos::Impl::always_false::value, + 
"custom reduction is not implemented"); } } // namespace Kokkos diff --git a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp index f2fa481f1dc..20ea392452b 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp @@ -82,7 +82,7 @@ class OpenACCTeamMember { // FIXME_OPENACC: team_broadcast() is not implemented. template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(std::is_void_v, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(std::is_void_v, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - std::is_void_v, + Kokkos::Impl::always_false::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } From 316ceac58eedf97fec5ee9f2fb87018a83fb97ec Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 15 Dec 2023 11:54:35 -0700 Subject: [PATCH 179/432] Improve "no copy mechanism" exception message --- core/src/Kokkos_CopyViews.hpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 3f02748c9cc..7ba17051322 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -612,12 +612,17 @@ void view_copy(const DstType& dst, const SrcType& src) { }; if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { - std::string message( - "Error: Kokkos::deep_copy with no available copy 
mechanism: "); - message += src.label(); - message += " to "; - message += dst.label(); - Kokkos::Impl::throw_runtime_exception(message); + std::ostringstream ss; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: "; + ss << "from src (\"" << src.label() << "\") to dst (\"" << dst.label() + << "\").\n"; + ss << "There is no common execution space that can access both src's " + "space\n"; + ss << "(" << src_memory_space().name() << ") and dst's space (" + << dst_memory_space().name() << "), "; + ss << "so src and dst\n"; + ss << "must be contiguous and have the same layout.\n"; + Kokkos::Impl::throw_runtime_exception(ss.str()); } // Figure out iteration order in case we need it From 838f8938e9aaf1ac78027575743bb95b442470a9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 15 Dec 2023 14:09:02 -0700 Subject: [PATCH 180/432] Add a unit test for new deep_copy exception msg --- core/unit_test/TestViewCopy_a.hpp | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/core/unit_test/TestViewCopy_a.hpp b/core/unit_test/TestViewCopy_a.hpp index 3bfc93aadac..a4735b29988 100644 --- a/core/unit_test/TestViewCopy_a.hpp +++ b/core/unit_test/TestViewCopy_a.hpp @@ -147,6 +147,40 @@ TEST(TEST_CATEGORY, view_copy_tests) { Kokkos::deep_copy(s_a, hs_a); ASSERT_TRUE(run_check(s_a, 6)); } + } else { + // These copies won't succeed, but they should each throw + // an exception whose message contains the view labels, + // and the names of the views' memory spaces. + // + // Note: original a,b both have the same device type, + // and their mirrors have the same device type. 
+ using memory_space = typename decltype(a)::memory_space; + using mirror_memory_space = typename decltype(h_a)::memory_space; + bool threw = false; + std::string msg; + try { + Kokkos::deep_copy(hs_b, s_b); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(hs_b.label()), std::string::npos); + ASSERT_NE(msg.find(s_b.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); + threw = false; + try { + Kokkos::deep_copy(s_a, hs_a); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(s_a.label()), std::string::npos); + ASSERT_NE(msg.find(hs_a.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); } // Contiguous copies From 72bc7ed42f0f658ebc3b158520feba086111fc3b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 15 Dec 2023 15:15:59 -0700 Subject: [PATCH 181/432] Add missing include sstream --- core/src/Kokkos_CopyViews.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 7ba17051322..b636a5c2f04 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -22,6 +22,7 @@ static_assert(false, #ifndef KOKKOS_COPYVIEWS_HPP_ #define KOKKOS_COPYVIEWS_HPP_ #include +#include #include #include #include From 154a57df8c53bff88dc4806967cb8a49d8ed1c40 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 18 Dec 2023 09:38:08 -0700 Subject: [PATCH 182/432] src->source, dst->destination and reformat a bit --- core/src/Kokkos_CopyViews.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index b636a5c2f04..6bc6485c769 100644 --- 
a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -614,15 +614,15 @@ void view_copy(const DstType& dst, const SrcType& src) { if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { std::ostringstream ss; - ss << "Error: Kokkos::deep_copy with no available copy mechanism: "; - ss << "from src (\"" << src.label() << "\") to dst (\"" << dst.label() - << "\").\n"; - ss << "There is no common execution space that can access both src's " - "space\n"; - ss << "(" << src_memory_space().name() << ") and dst's space (" - << dst_memory_space().name() << "), "; - ss << "so src and dst\n"; - ss << "must be contiguous and have the same layout.\n"; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: " + << "from source view (\"" << src.label() << "\") to destination view (\"" + << dst.label() << "\").\n" + << "There is no common execution space that can access both source's " + "space\n" + << "(" << src_memory_space().name() << ") and destination's space (" + << dst_memory_space().name() << "), " + << "so source and destination\n" + << "must be contiguous and have the same layout.\n"; Kokkos::Impl::throw_runtime_exception(ss.str()); } From 5df22b87b046290e84c1c6a2d4d3bcbbc47d1e5c Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Tue, 19 Dec 2023 09:28:44 -0500 Subject: [PATCH 183/432] Workaround for ROCm 6.0 failing to compile with AVX2 SIMD support --- simd/src/Kokkos_SIMD_AVX2.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 5f792751303..14eefe5fe20 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -30,9 +30,11 @@ "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" #endif -// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. 
-#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ - ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) +// FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here. +#if defined(__HIPCC__) && \ + (((HIP_VERSION_MAJOR == 5) && \ + ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))) || \ + ((HIP_VERSION_MAJOR == 6) && ((HIP_VERSION_MINOR == 0)))) #define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE #endif @@ -1059,7 +1061,8 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. #ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); #else From 52e44d6cfe35dcfc55c3d91ddff70cc63f8d514c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 20 Dec 2023 18:30:52 -0500 Subject: [PATCH 184/432] SYCL: Force inlining of Kokkos::printf (#6650) * SYCL: Force inlining of Kokkos::printf * Always force inline Kokkos::printf --- core/src/Kokkos_Printf.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Printf.hpp b/core/src/Kokkos_Printf.hpp index 39f95825c38..af20221a5ad 100644 --- a/core/src/Kokkos_Printf.hpp +++ b/core/src/Kokkos_Printf.hpp @@ -31,7 +31,7 @@ namespace Kokkos { // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. template -KOKKOS_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... 
args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) From 79164a43adc5da6e3eee12f3cfc66e3e1402ed3f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 6 Dec 2023 19:51:55 +0000 Subject: [PATCH 185/432] Improve handling of printf in OMPT on Intel GPUs --- core/src/Kokkos_Printf.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_Printf.hpp b/core/src/Kokkos_Printf.hpp index af20221a5ad..63a4cce2aeb 100644 --- a/core/src/Kokkos_Printf.hpp +++ b/core/src/Kokkos_Printf.hpp @@ -30,6 +30,9 @@ namespace Kokkos { // In contrast to std::printf, return void to get a consistent behavior across // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) +using ::printf; +#else template KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL @@ -39,15 +42,13 @@ KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { else sycl::ext::oneapi::experimental::printf(format, args...); #else - if constexpr (sizeof...(Args) == 0) ::printf("%s", format); - // FIXME_OPENMPTARGET non-string-literal argument used in printf is not - // supported for spir64 -#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)) + if constexpr (sizeof...(Args) == 0) + ::printf("%s", format); else ::printf(format, args...); #endif -#endif } +#endif } // namespace Kokkos From cbbe09b93db49e081ab4d400975aca8c007c4223 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Fri, 22 Dec 2023 08:56:26 -0800 Subject: [PATCH 186/432] OpenMP: Use `omp_get_nested` for older gcc versions (#6685) * OpenMP: fix for issue 6670. * OpenMP: Update gcc version when using max_active_level. * OpenMP: Edit execute_in_serial. 
* Apply suggestions from code review Co-authored-by: Daniel Arndt * clang-format --------- Co-authored-by: Rahulkumar Gayatri Co-authored-by: Damien L-G Co-authored-by: Daniel Arndt --- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index c0eb1c9f4d9..f4b753593d4 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -45,13 +45,15 @@ namespace Kokkos { namespace Impl { inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && !( -#if _OPENMP >= 201511 - (omp_get_max_active_levels() > 1) +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. +#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; #else - omp_get_nested() + bool is_nested = static_cast(omp_get_nested()); #endif - && (omp_get_level() == 1))); + return (OpenMP::in_parallel(space) && !(is_nested && (omp_get_level() == 1))); } } // namespace Impl From 8de16ea357136aeaa0f06a27d24820d3d2f37235 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 19 Dec 2023 19:27:31 +0000 Subject: [PATCH 187/432] Disable more Bessel tests for SYCL on Intel GPUs --- .../TestMathematicalSpecialFunctions.hpp | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 06c84c75137..b90055fd71e 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1213,13 +1213,13 @@ struct TestComplexBesselI0K0Function { } EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0)); - int upper_limit = N; + int upper_limit_0 = N; //
FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 19; + upper_limit_0 = 19; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_0; i++) { EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)), Kokkos::abs(h_ref_cbk0(i)) * 1e-13) << "at index " << i; @@ -1462,13 +1462,13 @@ struct TestComplexBesselI1K1Function { } EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0)); - int upper_limit = N; + int upper_limit_1 = N; // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 8; + upper_limit_1 = 8; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_1; i++) { EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)), Kokkos::abs(h_ref_cbk1(i)) * 1e-13) << "at index " << i; @@ -1718,20 +1718,26 @@ struct TestComplexBesselH1Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch10(0), h_ch10(0)); - for (int i = 1; i < N; i++) { + int upper_limit_10 = N; +// FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_10 = 17; +#endif + for (int i = 1; i < upper_limit_10; i++) { EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)), Kokkos::abs(h_ref_ch10(i)) * 1e-13) << "at index " << i; } EXPECT_EQ(h_ref_ch11(0), h_ch11(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case + int upper_limit_11 = N; + // FIXME_SYCL Failing for Intel GPUs, 2 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 16; + upper_limit_11 = 2; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_11; i++) { 
EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)), Kokkos::abs(h_ref_ch11(i)) * 1e-13) << "at index " << i; @@ -1912,19 +1918,26 @@ struct TestComplexBesselH2Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch20(0), h_ch20(0)); - for (int i = 1; i < N; i++) { + int upper_limit_20 = N; +// FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_20 = 16; +#endif + for (int i = 1; i < upper_limit_20; i++) { EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)), - Kokkos::abs(h_ref_ch20(i)) * 1e-13); + Kokkos::abs(h_ref_ch20(i)) * 1e-13) + << "at index " << i; } EXPECT_EQ(h_ref_ch21(0), h_ch21(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case + int upper_limit_21 = N; + // FIXME_SYCL Failing for Intel GPUs, 1 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 17; + upper_limit_21 = 1; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_21; i++) { EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)), Kokkos::abs(h_ref_ch21(i)) * 1e-13) << "at index " << i; From 391daefd51480fcf674d4c891e9296c1eb8e0e55 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 8 Dec 2023 17:12:23 -0500 Subject: [PATCH 188/432] fill_random without exceution space instance should fence --- algorithms/src/Kokkos_Random.hpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp index 89126609885..7df12b8518e 100644 --- a/algorithms/src/Kokkos_Random.hpp +++ b/algorithms/src/Kokkos_Random.hpp @@ -1543,13 +1543,23 @@ template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin, typename ViewType::const_value_type end) { - 
fill_random(typename ViewType::execution_space{}, a, g, begin, end); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, begin, end); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { - fill_random(typename ViewType::execution_space{}, a, g, 0, range); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, 0, range); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } } // namespace Kokkos From 5aa0ceee4b0132ec411f7fc1e11b4b9a1039cde3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 1 Jan 2024 22:36:53 -0500 Subject: [PATCH 189/432] Drop unnecessary guarding for a tool library being loaded in ProfilingSection --- core/src/Kokkos_Profiling_ProfileSection.hpp | 22 ++++---------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/core/src/Kokkos_Profiling_ProfileSection.hpp b/core/src/Kokkos_Profiling_ProfileSection.hpp index 29a04ac3b07..7975b70f316 100644 --- a/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -36,28 +36,14 @@ class ProfilingSection { ProfilingSection& operator=(ProfilingSection const&) = delete; ProfilingSection(const std::string& sectionName) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); - } + Kokkos::Profiling::createProfileSection(sectionName, &secID); } - void start() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::startSection(secID); - } - } + void start() { Kokkos::Profiling::startSection(secID); } - void stop() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - 
Kokkos::Profiling::stopSection(secID); - } - } + void stop() { Kokkos::Profiling::stopSection(secID); } - ~ProfilingSection() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::destroyProfileSection(secID); - } - } + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(secID); } protected: uint32_t secID; From 73c75075536f905818560f0bc415348b462fe38f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 1 Jan 2024 22:37:35 -0500 Subject: [PATCH 190/432] Drop unnecessary header include in Kokkos_Profiling_ProfileSection.hpp --- core/src/Kokkos_Profiling_ProfileSection.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/Kokkos_Profiling_ProfileSection.hpp b/core/src/Kokkos_Profiling_ProfileSection.hpp index 7975b70f316..d8a32aabd32 100644 --- a/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -22,7 +22,6 @@ #endif #include -#include #include #include From 02b46c09c75697da5a1e4bfed5429a56c1bfb7ef Mon Sep 17 00:00:00 2001 From: Phil Miller - NOAA Date: Wed, 3 Jan 2024 10:28:48 -0800 Subject: [PATCH 191/432] #5333: CUDA: Use scratch space appropriate to small reduction elements in Team reductions (#5334) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * #5333: Add maybe failing test case? 
* Revise test to try to target the right code * NOMERGE #5333: Partial implementation of fix * #5333: Update tests * #5333: Fix comparision types in reducers test * #5333: Re-enable tests * #5333: Add test for Scalar with size of 1 * Fix formatting * Remove unrelated changes * Extract TeamPolicy tests into a separate method * Disable new tests (OpenACC, OpenMPTarget, Serial, SYCL, point_t) * Use smaller number of teams for bhalf_t * Use pointer_type for m_scratch_space parameter * Skip failing tests (SYCL) * Fix tests for OpenMPTarget * Fix num_teams_done variable in SYCL TeamPolicy reduction * Restore guards for array_reduce * Also make array_reduce work for SYCL * tests: change point_t default constructor --------- Co-authored-by: Arkadiusz Szczepkowicz Co-authored-by: Cezary Skrzyński Co-authored-by: Daniel Arndt --- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 57 +++++++++---- core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 6 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 16 ++-- core/src/SYCL/Kokkos_SYCL_Team.hpp | 7 +- core/unit_test/TestNonTrivialScalarTypes.hpp | 2 +- core/unit_test/TestReducers.hpp | 83 +++++++++++++++++++ core/unit_test/TestReducers_d.hpp | 14 ++++ 7 files changed, 153 insertions(+), 32 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 6724c91fcbc..23dc569ab15 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -625,6 +625,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + std::conditional_t, + Kokkos::Cuda::size_type>; using size_type = Cuda::size_type; using reducer_type = ReducerType; @@ -648,9 +664,11 @@ class ParallelReduce + const integral_nonzero_constant word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value); // Iterate this block through the league @@ -723,18 +742,19 @@ class ParallelReduce( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? 
m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -784,7 +804,8 @@ class ParallelReduce(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { m_functor_reducer.get_reducer().final(&value); @@ -805,13 +826,15 @@ class ParallelReduce(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 3c0ade365ab..3037c4ab541 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -103,7 +103,7 @@ template __device__ bool cuda_inter_block_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, const FunctorType& reducer, - Cuda::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, Cuda::size_type* const m_scratch_flags, const int max_active_thread = blockDim.y) { @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + pointer_type global = m_scratch_space + blockIdx.x; *global = value; } @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction( last_block = true; value = neutral; - pointer_type const volatile global = (pointer_type)m_scratch_space; + pointer_type const 
volatile global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index a0aa123c934..31dcbe77421 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -170,6 +170,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, @@ -186,8 +187,6 @@ class Kokkos::Impl::ParallelReduce( - local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -225,10 +224,10 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } sycl::group_barrier(item.get_group()); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -277,10 +276,10 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } item.barrier(sycl::access::fence_space::local_space); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -324,10 +323,7 @@ class Kokkos::Impl::ParallelReduce local_mem( - sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); diff --git 
a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 89c09c3195f..dbba3827581 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -140,9 +140,14 @@ class SYCLTeamMember { } value = sg.shuffle(value, 0); + const auto n_subgroups = sg.get_group_range()[0]; + if (n_subgroups == 1) { + reducer.reference() = value; + return; + } + // We need to chunk up the whole reduction because we might not have // allocated enough memory. - const auto n_subgroups = sg.get_group_range()[0]; const unsigned int maximum_work_range = std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); diff --git a/core/unit_test/TestNonTrivialScalarTypes.hpp b/core/unit_test/TestNonTrivialScalarTypes.hpp index eaf7a4125cc..116ac58c39f 100644 --- a/core/unit_test/TestNonTrivialScalarTypes.hpp +++ b/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -214,7 +214,7 @@ struct point_t { uint8_t x, y, z; KOKKOS_FUNCTION - point_t() : x(1), y(1), z(1){}; + point_t() : x(0), y(0), z(0){}; KOKKOS_FUNCTION point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){}; diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 957b9a0ca1a..f710c40d3a3 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -19,6 +19,7 @@ #include #include +#include //-------------------------------------------------------------------------- @@ -46,6 +47,15 @@ struct TestReducers { void operator()(const int& i, Scalar& value) const { value += values(i); } }; + struct TeamSumFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m, Scalar& value) const { + if (m.team_rank() == m.team_size() - 1) value += Scalar(1); + } + }; + struct ProdFunctor { Kokkos::View values; @@ -319,6 +329,77 @@ struct TestReducers { value = value || values(i); } }; + + // get number of teams for TeamPolicy depending on the tested type + constexpr 
static int get_num_teams() { + if constexpr (sizeof(Scalar) == 1) { + return 126; + } else if constexpr (std::is_same_v) { + return 256; + } + + return 1024; + } + + static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) { +#ifdef KOKKOS_ENABLE_OPENACC + if constexpr (std::is_same_v && + (std::is_same_v || + std::is_same_v)) { + return; // FIXME_OPENACC + } +#endif + + using member_type = typename Kokkos::TeamPolicy::member_type; + + Scalar sum_scalar; + Kokkos::View sum_view("result"); + Kokkos::deep_copy(sum_view, Scalar(1)); + + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; +#ifdef KOKKOS_ENABLE_OPENMPTARGET + auto team_pol = Kokkos::TeamPolicy(num_teams, Kokkos::AUTO); +#else + auto team_pol = Kokkos::TeamPolicy(num_teams, 1); +#endif + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + + Kokkos::parallel_for( +#ifdef KOKKOS_ENABLE_OPENMPTARGET + Kokkos::TeamPolicy(1, Kokkos::AUTO), +#else + Kokkos::TeamPolicy(1, 1), +#endif + KOKKOS_LAMBDA(member_type team_member) { + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, 0), f, + reducer_scalar); + sum_view() = local_scalar; + }); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{0}) << "N: " << N; + + auto team_size = std::min(128, TEST_EXECSPACE().concurrency()); + Kokkos::parallel_for( + Kokkos::TeamPolicy(10, team_size), + KOKKOS_LAMBDA(member_type team_member) { + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, N), f, + reducer_scalar); + sum_view() = local_scalar; + }); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, reference_sum) << "N: " << N; + } + static void test_sum(int N) { Kokkos::View values("Values", N); auto h_values = 
Kokkos::create_mirror_view(values); @@ -374,6 +455,8 @@ struct TestReducers { ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N; } + test_sum_team_policy(N, f, reference_sum); + { Kokkos::View sum_view("View"); sum_view() = Scalar(1); diff --git a/core/unit_test/TestReducers_d.hpp b/core/unit_test/TestReducers_d.hpp index 19eaa6d7000..ecf851aa108 100644 --- a/core/unit_test/TestReducers_d.hpp +++ b/core/unit_test/TestReducers_d.hpp @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) { TestReducers::test_prod(4); } +TEST(TEST_CATEGORY, reducers_int16_t) { + using ThisTestType = int16_t; + + TestReducers::test_sum(1); + TestReducers::test_sum(2); + TestReducers::test_sum(3); + TestReducers::test_sum(4); + + TestReducers::test_prod(1); + TestReducers::test_prod(2); + TestReducers::test_prod(3); + TestReducers::test_prod(4); +} + #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET) // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to // implicitly-deleted default constructor of 'conv_type' From 4078a0d8a022ab1ad2291554b2820916fa3bc361 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 3 Jan 2024 13:54:16 -0500 Subject: [PATCH 192/432] Cuda: Allocate using the correct device (#6392) * Cuda: Allocate using the correct device * Avoid warning about uninitialized variable * exec_space_provided -> stream_sync_only * Fix up ASYNC support * Only check for errors in synchronization if allocation was successful * Rename arguments and make constructor private --------- Co-authored-by: Daniel Arndt --- core/src/Cuda/Kokkos_CudaSpace.cpp | 109 ++++++++++++------------- core/src/Cuda/Kokkos_CudaSpace.hpp | 36 +++++++- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 26 +++--- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 2 +- 4 files changed, 103 insertions(+), 70 deletions(-) diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index c6512f44dad..9cd074df4fd 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ 
b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -135,11 +135,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { namespace Kokkos { -CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} - -CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} - -CudaHostPinnedSpace::CudaHostPinnedSpace() {} +CudaSpace::CudaSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaSpace::CudaSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} + +CudaUVMSpace::CudaUVMSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} + +CudaHostPinnedSpace::CudaHostPinnedSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} size_t memory_threshold_g = 40000; // 40 kB @@ -161,52 +173,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, } namespace { -void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, - const size_t arg_alloc_size, +void *impl_allocate_common(const int device_id, + [[maybe_unused]] const cudaStream_t stream, + const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - bool exec_space_provided) { + [[maybe_unused]] bool stream_sync_only) { void *ptr = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - cudaError_t error_code; if (arg_alloc_size >= memory_threshold_g) { - if (exec_space_provided) { - error_code = - exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async malloc"); + error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); + + if (error_code == cudaSuccess) { + if (stream_sync_only) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } else { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); + } } - } else { - error_code = - (exec_space_provided - ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size) - : Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size)); - } -#else - cudaError_t error_code; - if (exec_space_provided) { - error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } + } else #endif + { error_code = cudaMalloc(&ptr, arg_alloc_size); } if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -226,7 +224,7 @@ void *CudaSpace::impl_allocate( const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const 
Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size, + return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size, arg_logical_size, arg_handle, false); } @@ -234,8 +232,9 @@ void *CudaSpace::impl_allocate( const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(exec_space, arg_label, arg_alloc_size, - arg_logical_size, arg_handle, true); + return impl_allocate_common( + exec_space.cuda_device(), exec_space.cuda_stream(), arg_label, + arg_alloc_size, arg_logical_size, arg_handle, true); } void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { @@ -256,28 +255,27 @@ void *CudaUVMSpace::impl_allocate( if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; - auto error_code = - Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper( - &ptr, arg_alloc_size, cudaMemAttachGlobal); - -#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST - if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper( - ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId))); -#endif + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: CudaMallocManaged); } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + 
KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId)); +#endif } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); @@ -302,13 +300,14 @@ void *CudaHostPinnedSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper( - &ptr, arg_alloc_size, cudaHostAllocDefault); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index 058636b07fe..4a220dd6450 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -68,6 +68,11 @@ class CudaSpace { /*--------------------------------*/ CudaSpace(); + + private: + CudaSpace(int device_id, cudaStream_t stream); + + public: CudaSpace(CudaSpace&& rhs) = default; CudaSpace(const CudaSpace& rhs) = default; CudaSpace& operator=(CudaSpace&& rhs) = default; @@ -89,6 +94,10 @@ class CudaSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaSpace impl_create(int device_id, cudaStream_t stream) { + return CudaSpace(device_id, stream); + } + private: void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, @@ -110,7 +119,8 @@ class CudaSpace { static constexpr const char* name() { return m_name; } private: - int m_device; ///< Which Cuda 
device + int m_device; + cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; friend class Kokkos::Impl::SharedAllocationRecord; @@ -147,6 +157,11 @@ class CudaUVMSpace { /*--------------------------------*/ CudaUVMSpace(); + + private: + CudaUVMSpace(int device_id, cudaStream_t stream); + + public: CudaUVMSpace(CudaUVMSpace&& rhs) = default; CudaUVMSpace(const CudaUVMSpace& rhs) = default; CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; @@ -185,8 +200,13 @@ class CudaUVMSpace { #endif /*--------------------------------*/ + static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) { + return CudaUVMSpace(device_id, stream); + } + private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST static bool kokkos_impl_cuda_pin_uvm_to_host_v; @@ -219,6 +239,11 @@ class CudaHostPinnedSpace { /*--------------------------------*/ CudaHostPinnedSpace(); + + private: + CudaHostPinnedSpace(int device_id, cudaStream_t stream); + + public: CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; @@ -236,6 +261,10 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) { + return CudaHostPinnedSpace(device_id, stream); + } + private: void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -252,6 +281,9 @@ class CudaHostPinnedSpace { static constexpr const char* name() { return m_name; } private: + int m_device; + cudaStream_t m_stream; + static constexpr const char* m_name = "CudaHostPinned"; /*--------------------------------*/ diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 6d0f0707d82..804505d0d74 100644 --- 
a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -#include -#include -#include -#include +//#include +//#include +//#include +//#include #include #include #include @@ -454,8 +454,9 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size); + Record *const r = + Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream), + "Kokkos::InternalScratchFlags", alloc_size); Record::increment(r); @@ -480,8 +481,9 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size); + Record *const r = + Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream), + "Kokkos::InternalScratchSpace", alloc_size); Record::increment(r); @@ -505,7 +507,7 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); Record *const r = - Record::allocate(Kokkos::CudaHostPinnedSpace(), + Record::allocate(CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream), "Kokkos::InternalScratchUnified", alloc_size); Record::increment(r); @@ -526,9 +528,9 @@ Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (m_scratchFunctor) Record::decrement(Record::get_record(m_scratchFunctor)); - Record *const r = - Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); + Record *const r = Record::allocate( + CudaSpace::impl_create(m_cudaDev, m_stream), + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize); Record::increment(r); diff --git 
a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 4d6c915b9e9..29e4e3f9679 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -22,7 +22,7 @@ #include #include #include - +#include "Kokkos_CudaSpace.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // These functions fulfill the purpose of allowing to work around From f485cfa53cb2d25cab98e7a1d41a116027ec2363 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jan 2024 15:26:03 -0500 Subject: [PATCH 193/432] Let `Profiling::ProfilingSection(std::string)` constructor be explicit and nodiscard (#6690) * Add nodiscard attribute to Profiling::ProfilingSection * Make Profiling::ProfilingSection constructor explicit --- core/src/Kokkos_Profiling_ProfileSection.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_Profiling_ProfileSection.hpp b/core/src/Kokkos_Profiling_ProfileSection.hpp index d8a32aabd32..1a87811480d 100644 --- a/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -29,12 +29,15 @@ namespace Kokkos { namespace Profiling { -class ProfilingSection { +class [[nodiscard]] ProfilingSection { public: ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; - ProfilingSection(const std::string& sectionName) { +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif + explicit ProfilingSection(const std::string& sectionName) { Kokkos::Profiling::createProfileSection(sectionName, &secID); } From 4eae6a99f1123f31496504f60cb6a4735559052c Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 2 Jan 2024 16:55:35 -0500 Subject: [PATCH 194/432] Cosmetic changes to ProfilingSection --- core/src/Kokkos_Profiling_ProfileSection.hpp | 19 
++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/core/src/Kokkos_Profiling_ProfileSection.hpp b/core/src/Kokkos_Profiling_ProfileSection.hpp index 1a87811480d..e7a9ba0c7ed 100644 --- a/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -26,10 +26,11 @@ #include -namespace Kokkos { -namespace Profiling { +namespace Kokkos::Profiling { class [[nodiscard]] ProfilingSection { + uint32_t sectionID; + public: ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; @@ -38,21 +39,17 @@ class [[nodiscard]] ProfilingSection { [[nodiscard]] #endif explicit ProfilingSection(const std::string& sectionName) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); + Kokkos::Profiling::createProfileSection(sectionName, &sectionID); } - void start() { Kokkos::Profiling::startSection(secID); } - - void stop() { Kokkos::Profiling::stopSection(secID); } + void start() { Kokkos::Profiling::startSection(sectionID); } - ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(secID); } + void stop() { Kokkos::Profiling::stopSection(sectionID); } - protected: - uint32_t secID; + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(sectionID); } }; -} // namespace Profiling -} // namespace Kokkos +} // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE From 654a51f60cef8900868f55a93661cee80f451cd9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 4 Jan 2024 09:44:12 -0500 Subject: [PATCH 195/432] GitHub CI: Test with AddressSanitizer (#6676) * GitHub CI: Test with AddressSanitizer * Disable view_allocation_error when running with AddressSanitizer * Try __attribute__((no_sanitize(address))) * Revert "Try __attribute__((no_sanitize(address)))" This reverts commit 335f7f26ae96034ceae221e5a80bd93b4769ed1d. 
--- .github/workflows/continuous-integration-workflow.yml | 11 +++++++++++ core/unit_test/TestViewAPI_d.hpp | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 6446cbacd9b..645cdce83cf 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -47,9 +47,17 @@ jobs: backend: 'OPENMP' - distro: 'ubuntu:latest' cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address' + extra_linker_flags: '-fsanitize=address' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"' + - distro: 'ubuntu:latest' + cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address' + extra_linker_flags: '-fsanitize=address' + cmake_build_type: 'RelWithDebInfo' + backend: 'SERIAL' - distro: 'ubuntu:latest' cxx: 'g++' cmake_build_type: 'RelWithDebInfo' @@ -106,6 +114,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \ + -DCMAKE_EXE_LINKER_FLAGS="${{ matrix.extra_linker_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} @@ -118,6 +127,7 @@ jobs: working-directory: builddir run: ctest --output-on-failure - name: Test linking against build dir + if: ${{ matrix.cxx_extra_flags != '-fsanitize=address' }} working-directory: example/build_cmake_installed run: | cmake -B builddir_buildtree -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DKokkos_ROOT=../../builddir @@ -128,6 +138,7 @@ jobs: - name: Install run: sudo cmake --build builddir --target install - name: Test install + if: ${{ matrix.cxx_extra_flags != '-fsanitize=address' }} working-directory: example/build_cmake_installed run: | cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} diff --git 
a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index 08d21f54499..b44335279b3 100644 --- a/core/unit_test/TestViewAPI_d.hpp +++ b/core/unit_test/TestViewAPI_d.hpp @@ -27,6 +27,12 @@ TEST(TEST_CATEGORY, view_api_d) { } TEST(TEST_CATEGORY, view_allocation_error) { +#if defined(__has_feature) +#if __has_feature(address_sanitizer) + GTEST_SKIP() << "AddressSanitzer detects allocating too much memory " + "preventing our checks to run"; +#endif +#endif #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; #endif From efc0c365c0b61c63a47712656788d2d30ab16d4b Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Thu, 4 Jan 2024 08:46:25 -0600 Subject: [PATCH 196/432] Kokkos::Array deduction guide (#6373) * Added a deduction guide for Kokkos::Array * Added unit test for Kokkos::Array deduction guide * Decorated is_equal with KOKKOS_FUNCTION --- core/src/Kokkos_Array.hpp | 3 +++ core/unit_test/TestArray.cpp | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 82ceaaec218..7773d20339a 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -320,6 +320,9 @@ struct Array::strided> { : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +template +Array(T, Us...)->Array; + } // namespace Kokkos // diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index d3bdc4f93f7..76024b72f5f 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -49,4 +49,25 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); +template +KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { + if (std::size(l) != std::size(r)) return false; + + for (size_t i = 0; i != std::size(l); ++i) { + if (l[i] != r[i]) return false; + } + + return true; +} + 
+KOKKOS_FUNCTION constexpr bool test_array_ctad() { + constexpr int x = 10; + constexpr Kokkos::Array a{1, 2, 3, 5, x}; + constexpr Kokkos::Array b{1, 2, 3, 5, x}; + + return std::is_same_v && is_equal(a, b); +} + +static_assert(test_array_ctad()); + } // namespace From 06de563f9958c58a6f292d742d8f658d67600d7c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 4 Jan 2024 11:34:30 -0500 Subject: [PATCH 197/432] Add CI for MSVC+Cuda (#6661) --- .github/workflows/windows.yml | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .github/workflows/windows.yml diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 00000000000..e66c6cb99df --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,36 @@ +name: github-windows + +on: + push: + pull_request: + +concurrency: + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: + contents: read + +jobs: + windows-cuda: + # Cuda build on Windows + name: Windows Cuda + runs-on: windows-2022 + + steps: + - uses: Jimver/cuda-toolkit@v0.2.11 + id: cuda-toolkit + with: + cuda: '12.1.0' + - uses: actions/checkout@v4 + - name: configure + shell: bash + run: | + mkdir build + mkdir c:/project + cd build + cmake -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE=ON .. 
+ - name: build library + shell: bash + run: | + cmake --build build --parallel 2 --config Release From 9f5e38e976f1c9003d5895803e5d5ce03015dbbb Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 4 Jan 2024 11:42:37 -0500 Subject: [PATCH 198/432] SYCL: Address deprecations after oneAPI 2023.2.0 (#6577) * Address deprecations in oneAPI 2023.2.0 * Define KOKKOS_IMPL_SYCL_GET_MULTI_PTR --- core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 3 ++- core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 15 ++++++++------- core/src/setup/Kokkos_Setup_SYCL.hpp | 7 +++++++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index f8abdf8443d..ecb4a863da2 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -76,7 +76,8 @@ class Kokkos::Impl::ParallelFor, auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), shmem_begin, + scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, item.get_group_linear_id(), item.get_group_range(1)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 31dcbe77421..01819b848af 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -131,9 +131,10 @@ class Kokkos::Impl::ParallelReduce) functor(team_member, update); else @@ -200,8 +201,8 @@ class Kokkos::Impl::ParallelReduce #endif +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20230200 +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) \ + accessor.get_multi_ptr() +#else +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() +#endif + #endif From 
cbf1c644c287a9a1f3c4220b7dbe196b7aba0b44 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 4 Jan 2024 14:58:20 -0500 Subject: [PATCH 199/432] Fixup cast tolerance to double before printing --- core/unit_test/TestMathematicalFunctions.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 1c1cd4f8e69..ad035d4e4bf 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -293,7 +293,7 @@ struct FloatingPointComparison { bool ar = absolute(fpv) < abs_tol; if (!ar) { Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n", - (double)fpv, abs_tol); + (double)fpv, (double)abs_tol); } return ar; @@ -314,7 +314,7 @@ struct FloatingPointComparison { bool ar = abs_diff == 0 || rel_diff < rel_tol; if (!ar) { Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n", - (double)rel_diff, rel_tol); + (double)rel_diff, (double)rel_tol); } return ar; From 3358970c2fe7c36e6de79912edd2a65f5a46b53f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 4 Jan 2024 15:20:05 -0500 Subject: [PATCH 200/432] Try linking against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE --- cmake/kokkos_tpls.cmake | 4 +--- core/src/CMakeLists.txt | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index f124596a84e..f8cb90d6dcf 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -77,9 +77,7 @@ KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(CUDA INTERFACE) -ENDIF() +KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt 
index b4a25c0813e..1943f7e5485 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -187,9 +187,7 @@ ENDIF() KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -ENDIF() +KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) From 716bef2a4c8025076b2893e9faaf4330b716041d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 4 Jan 2024 15:53:14 -0700 Subject: [PATCH 201/432] test_array_ctad: disable test for intel versions < 2021 Address issue #6702 --- core/unit_test/TestArray.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index 76024b72f5f..673d0036b71 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -60,6 +60,8 @@ KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { return true; } +// Disable ctad test for intel versions < 2021, see issue #6702 +#if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 KOKKOS_FUNCTION constexpr bool test_array_ctad() { constexpr int x = 10; constexpr Kokkos::Array a{1, 2, 3, 5, x}; @@ -69,5 +71,6 @@ KOKKOS_FUNCTION constexpr bool test_array_ctad() { } static_assert(test_array_ctad()); +#endif } // namespace From 3523bc3e789110445320af86e1feea6862b31b68 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Fri, 5 Jan 2024 09:11:17 -0500 Subject: [PATCH 202/432] Enable `{transform_}exclusive_scan` in place (#6667) * Enable exclusive_scan in place * add test for inplace operation * improve tests * formatting * fix test for transform_* * simplify args * revise tests --------- Co-authored-by: Francesco Rizzi --- .../impl/Kokkos_FunctorsForExclusiveScan.hpp | 9 +- .../TestStdAlgorithmsExclusiveScan.cpp | 228 +++++++++--------- 
.../TestStdAlgorithmsTeamExclusiveScan.cpp | 39 ++- ...tdAlgorithmsTeamTransformExclusiveScan.cpp | 41 +++- ...estStdAlgorithmsTransformExclusiveScan.cpp | 93 +++++-- 5 files changed, 252 insertions(+), 158 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp index 8151ee34955..5a7fe16984a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = m_first_from[i]; if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; + update += tmp; } }; @@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; if (final_pass) { if (i == 0) { m_first_dest[i] = m_init_value; @@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { } } - const auto tmp = value_type{m_first_from[i], false}; this->join(update, tmp); } @@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper { } } - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; this->join(update, tmp); } @@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const 
bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { } } - const auto tmp = ValueType{m_unary_op(m_first_from[i])}; this->join(update, tmp); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 56387e3c92b..b364c53a888 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -133,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - ValueType init_value, BinaryOp bop) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), init_value, bop); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - if (test_view_h.extent(0) > 0) { - for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << 
std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -190,107 +149,153 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value, BinaryOp bop) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), init_value, bop); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + if (test_view_h.extent(0) > 0) { + for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value) { + (*this)(data_view, test_view, init_value, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info, - ValueType init_value) { - using default_op = SumFunctor; +template +void 
run_single_scenario(const InfoType& scenario_info, ValueType init_value, + OpOrEmpty... empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm everytime to + // ensure the algorithm does something meaningful { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); + auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, + empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + 
VerifyData()(view_from, view_dest, init_value, empty_or_op...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, - ValueType init_value, BinaryOp bop) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, + OpOrEmpty... empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); - fill_view(view_from, name); + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view1"); + fill_view(view1, name); + auto view2 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), - 
KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = - KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } Kokkos::fence(); @@ -304,34 +309,39 @@ void run_exclusive_scan_all_scenarios() { {"medium", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it, ValueType{0}); - run_single_scenario_default_op(it, ValueType{1}); - run_single_scenario_default_op(it, ValueType{-2}); - run_single_scenario_default_op(it, ValueType{3}); + run_single_scenario(it, ValueType{0}); + run_single_scenario(it, ValueType{1}); + run_single_scenario(it, ValueType{-2}); + run_single_scenario(it, ValueType{3}); + + run_single_scenario_inplace(it, ValueType{0}); + run_single_scenario_inplace(it, ValueType{-2}); #if !defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == 
"small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, - custom_bop_t()); - } + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); - using custom_bop_t = SumFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, + run_single_scenario_inplace(it, ValueType{0}, custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, + run_single_scenario_inplace(it, ValueType{-2}, custom_bop_t()); + } + + using custom_bop_t = SumFunctor; + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace(it, ValueType{-2}, + custom_bop_t()); #endif } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index c6b2566c6cf..4fa4e624db1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -121,7 +121,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, 
std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -223,11 +229,16 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : 
teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { @@ -236,7 +247,7 @@ void run_all_scenarios() { #else for (int apiId : {0, 1}) { #endif - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -246,6 +257,10 @@ TEST(std_algorithms_exclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamExclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 9f30812d8ef..60fa369af18 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -108,7 +108,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - 
Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef transform_exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformExclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index f574832cc63..fa2804256ac 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -161,24 +161,15 @@ void 
verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -206,17 +197,13 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, BinaryOp bop, UnaryOp uop) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "transform_exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - - auto view_dest = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); - auto view_from = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); + + auto view_from = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_from"); fill_view(view_from, name); + auto view_dest = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_dest"); { fill_zero(view_dest); auto r = KE::transform_exclusive_scan( @@ -254,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, Kokkos::fence(); } +template 
+void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, BinaryOp bop, + UnaryOp uop) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "transform_exclusive_scan_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "transform_exclusive_scan_view2"); + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan( + "label", exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value, + bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2, + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + Kokkos::fence(); +} + template void run_all_scenarios() { const std::map scenarios = { @@ -268,6 +314,11 @@ void run_all_scenarios() { run_single_scenario(it, ValueType{1}, bop_t(), uop_t()); 
run_single_scenario(it, ValueType{-2}, bop_t(), uop_t()); run_single_scenario(it, ValueType{3}, bop_t(), uop_t()); + + run_single_scenario_inplace(it, ValueType{0}, bop_t(), + uop_t()); + run_single_scenario_inplace(it, ValueType{-2}, bop_t(), + uop_t()); } } From 27286c32d7e0c157d1a4d7f00f9441979ed8eabe Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 8 Jan 2024 20:57:55 -0500 Subject: [PATCH 203/432] Add `ATOMICS_BYPASS` configuration option to disable atomics (#6692) * Add NAME_TBD_UNSAFE_ATOMICS configuration option to disable atomics * Rename configuration option and macro NAME_TBD_UNSAFE_ATOMICS -> ATOMICS_BYPASS * Improve option description and configuration time error message Co-Authored-By: Christian Trott Co-Authored-By: Daniel Arndt * Mention Kokkos_ENABLE_ATOMICS_BYPASS option in error message * Simplify sanity check for disabling atomics in `Kokkos_Macros.hpp` Co-Authored-By: Daniel Arndt --------- Co-authored-by: Christian Trott Co-authored-by: Daniel Arndt --- cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 11 +++++++++++ cmake/kokkos_enable_options.cmake | 1 + core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 2 +- core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 2 +- core/src/Kokkos_Macros.hpp | 11 ++++++----- core/src/Serial/Kokkos_Serial.cpp | 2 +- 7 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 321678dcf6f..f54475e45aa 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -52,6 +52,7 @@ #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index b5a8e38950a..7c834f2b0dd 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -1139,3 +1139,14 @@ MESSAGE(STATUS
"Architectures:") FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) MESSAGE(STATUS " ${Arch}") ENDFOREACH() + + +IF(KOKKOS_ENABLE_ATOMICS_BYPASS) + IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") + ENDIF() + IF(NOT KOKKOS_ENABLE_SERIAL) + MESSAGE(FATAL_ERROR "Implementation bug") # safeguard + ENDIF() + MESSAGE(STATUS "Atomics: **DISABLED**") +ENDIF() diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index a36742e4dfb..a437f6132aa 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -73,6 +73,7 @@ KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple ke # This option will go away eventually, but allows fallback to old implementation when needed. KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") diff --git a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 1c434746321..9acacef901a 100644 --- a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -25,7 +25,7 @@ static_assert(false, #include #include -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 
bda37839805..eebdd20f15d 100644 --- a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -49,7 +49,7 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #endif // ============================================================ -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index a77e50b65b3..b255d2a5195 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -84,11 +84,12 @@ //---------------------------------------------------------------------------- -#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ - !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENACC) -#define KOKKOS_INTERNAL_NOT_PARALLEL +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index 071ecdbc4fa..39b201976b5 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -153,7 +153,7 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef 
KOKKOS_ENABLE_ATOMICS_BYPASS os << "Kokkos atomics disabled\n"; #endif From 0e4a158a7585f2a93b93d4121788016fe0dba4c9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 8 Jan 2024 21:06:36 -0500 Subject: [PATCH 204/432] Check matching static extents in View constructor (#5190) * Also check static extents * Use separate bools for the two checks and improve error messages * Improve error messages * Refactor test * Add another test * Extend existing tests * Use make_index_sequence * Remove LIVE and DIE * Try removing guards for OpenMPTarget * Use different style in view_construction_with_wrong_static_extents * Remove template template parameter * Also check unmanaged views * Fix DynRankView * Add FIXME * Pass label by const char * * fix terminating define * Deduce rank and dynamic_rank from View * Only call runtime_check_rank when KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK is defined * Guard with if constexpr --------- Co-authored-by: Francesco Rizzi --- containers/src/Kokkos_DynRankView.hpp | 13 +- core/src/Kokkos_View.hpp | 116 ++++-- core/src/impl/Kokkos_ViewArray.hpp | 6 + core/unit_test/TestViewAPI.hpp | 12 +- core/unit_test/TestViewCtorDimMatch.hpp | 501 +++++++----------------- 5 files changed, 265 insertions(+), 383 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 33d9562ea4f..5fa59f1b7cd 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1653,8 +1653,17 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - return View::type, Args...>( - v.data(), v.impl_map().layout()); + auto layout = v.impl_map().layout(); + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + for (int i = N; i < 7; ++i) + layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; + } + + return View::type, Args...>(v.data(), layout); } template diff --git a/core/src/Kokkos_View.hpp 
b/core/src/Kokkos_View.hpp index 1d2b4b9be2c..7a2e0eb0323 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -75,25 +75,59 @@ constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); } -KOKKOS_INLINE_FUNCTION -void runtime_check_rank(const size_t rank, const size_t dyn_rank, - const bool is_void_spec, const size_t i0, - const size_t i1, const size_t i2, const size_t i3, - const size_t i4, const size_t i5, const size_t i6, - const size_t i7, const std::string& label) { +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { (void)(label); if (is_void_spec) { const size_t num_passed_args = count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". 
The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } - if (num_passed_args != dyn_rank && num_passed_args != rank) { + if (!n_args_is_dyn_rank && !n_args_is_rank) { KOKKOS_IF_ON_HOST( const std::string message = - "Constructor for Kokkos View '" + label + - "' has mismatched number of arguments. Number of arguments = " + + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + std::to_string(num_passed_args) + - " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; Kokkos::abort(message.c_str());) KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " "mismatched number of arguments.");) @@ -1402,21 +1436,30 @@ class View : public ViewTraits { "execution space"); } - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - rank, rank_dynamic, - std::is_same::value, i0, i1, i2, i3, - i4, i5, i6, i7, alloc_name); +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = 
arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); @@ -1445,6 +1488,29 @@ class View : public ViewTraits { typename Impl::ViewCtorProp::pointer_type>::value, "Constructing View to wrap user memory must supply matching pointer " "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif } // Simple dimension-only layout diff --git a/core/src/impl/Kokkos_ViewArray.hpp b/core/src/impl/Kokkos_ViewArray.hpp index 93c0fddc447..fe43b630184 100644 --- a/core/src/impl/Kokkos_ViewArray.hpp +++ b/core/src/impl/Kokkos_ViewArray.hpp @@ -129,6 +129,12 @@ class ViewMapping> { return m_impl_offset.m_dim.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + using dim_type = typename offset_type::dimension_type; + return dim_type::static_extent(r); + } + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() const { return m_impl_offset.layout(); diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index 4c27695f6d1..a8492523a1e 100644 --- 
a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -1004,25 +1004,25 @@ class TestViewAPI { hView3 hv_3("dView3::HostMirror", N0); hView4 hv_4("dView4::HostMirror", N0); - dView0 dv_0_1(nullptr, 0); + dView0 dv_0_1(nullptr); dView0 dv_0_2(hv_0.label(), hv_0.layout()); - dView1 dv_1_1(nullptr, 0); + dView1 dv_1_1(nullptr, N0); dView1 dv_1_2(hv_1.label(), hv_1.layout()); - dView2 dv_2_1(nullptr, 0); + dView2 dv_2_1(nullptr, N0); dView2 dv_2_2(hv_2.label(), hv_2.layout()); - dView3 dv_3_1(nullptr, 0); + dView3 dv_3_1(nullptr, N0); dView3 dv_3_2(hv_3.label(), hv_3.layout()); - dView4 dv_4_1(nullptr, 0); + dView4 dv_4_1(nullptr, N0); dView4 dv_4_2(hv_4.label(), hv_4.layout()); } static void run_test_contruction_from_layout_2() { using dView3_0 = Kokkos::View; - using dView3_1 = Kokkos::View; + using dView3_1 = Kokkos::View; using dView3_2 = Kokkos::View; using dView3_3 = Kokkos::View; diff --git a/core/unit_test/TestViewCtorDimMatch.hpp b/core/unit_test/TestViewCtorDimMatch.hpp index d71841eef84..40b7737f2e4 100644 --- a/core/unit_test/TestViewCtorDimMatch.hpp +++ b/core/unit_test/TestViewCtorDimMatch.hpp @@ -19,33 +19,72 @@ namespace Test { -#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR) -#define DIE(EXPR, ARGS, DYNRANK) \ - ASSERT_DEATH( \ - EXPR, \ - "Constructor for Kokkos View 'v_" #ARGS \ - "' has mismatched number of arguments. Number of arguments = " #ARGS \ - " but dynamic rank = " #DYNRANK) +template +void test_matching_arguments_rank_helper(std::index_sequence) { + constexpr int nargs = sizeof...(Is); + using view_type = Kokkos::View; + if (nargs == rank || nargs == dynrank) { + EXPECT_NO_THROW({ view_type v("v", ((Is * 0) + 1)...); }); + EXPECT_NO_THROW({ view_type v(nullptr, ((Is * 0) + 1)...); }); + } else { + ASSERT_DEATH( + { view_type v("v", ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'v' has mismatched number of arguments. 
" + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + ASSERT_DEATH( + { view_type v(nullptr, ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'UNMANAGED' has mismatched number of " + "arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + } +} -#define PARAM_0 -#define PARAM_1 1 -#define PARAM_2 1, 1 -#define PARAM_3 1, 1, 1 -#define PARAM_4 1, 1, 1, 1 -#define PARAM_5 1, 1, 1, 1, 1 -#define PARAM_6 1, 1, 1, 1, 1, 1 -#define PARAM_7 1, 1, 1, 1, 1, 1, 1 +template class RankType> +void test_matching_arguments_rank() { + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<0>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<1>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<2>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<3>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<4>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<5>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<6>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<7>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<8>()); +} -#define PARAM_0_RANK 0 -#define PARAM_1_RANK 1 -#define PARAM_2_RANK 2 -#define PARAM_3_RANK 3 -#define PARAM_4_RANK 4 -#define PARAM_5_RANK 5 -#define PARAM_6_RANK 6 -#define PARAM_7_RANK 7 +template +struct DynamicRank { + using type = typename DynamicRank::type*; +}; -using DType = int; +template <> +struct DynamicRank<0> { + using type = int; +}; // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until // Kokkos::abort() aborts properly on that backend 
@@ -53,348 +92,110 @@ using DType = int; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType *; - using DType_2 = DType **; - using DType_3 = DType ***; - using DType_4 = DType ****; - using DType_5 = DType *****; - using DType_6 = DType ******; - using DType_7 = DType *******; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 2, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 3, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); 
}, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 4, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 5, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, DynamicRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 1, DynamicRank>(); // dim = 1, dynamic = 1 + test_matching_arguments_rank<2, 2, DynamicRank>(); // dim = 2, dynamic = 2 + test_matching_arguments_rank<3, 3, DynamicRank>(); // dim = 3, dynamic = 3 + test_matching_arguments_rank<4, 4, DynamicRank>(); // dim = 4, dynamic = 4 + test_matching_arguments_rank<5, 5, DynamicRank>(); // dim = 5, dynamic = 5 + test_matching_arguments_rank<6, 6, DynamicRank>(); // dim = 6, dynamic = 6 + test_matching_arguments_rank<7, 7, DynamicRank>(); // dim = 7, dynamic = 7 + test_matching_arguments_rank<8, 8, 
DynamicRank>(); // dim = 8, dynamic = 8 +#endif +} - { - // test View parameters for View dim = 6, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +template +struct StaticRank { + using type = typename StaticRank::type[1]; +}; - { - // test View parameters for View dim = 7, dynamic = 7 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 7); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 7); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 7); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 7); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 7); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 7); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 7); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 7); - } -} +template <> +struct StaticRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType[1][1]; - using DType_3 = DType[1][1][1]; - using DType_4 = DType[1][1][1][1]; - using DType_5 = DType[1][1][1][1][1]; - using DType_6 = DType[1][1][1][1][1][1]; - using DType_7 = DType[1][1][1][1][1][1][1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ 
Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 3, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 4, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +#ifdef 
KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, StaticRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, StaticRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 0, StaticRank>(); // dim = 2, dynamic = 0 + test_matching_arguments_rank<3, 0, StaticRank>(); // dim = 3, dynamic = 0 + test_matching_arguments_rank<4, 0, StaticRank>(); // dim = 4, dynamic = 0 + test_matching_arguments_rank<5, 0, StaticRank>(); // dim = 5, dynamic = 0 + test_matching_arguments_rank<6, 0, StaticRank>(); // dim = 6, dynamic = 0 + test_matching_arguments_rank<7, 0, StaticRank>(); // dim = 7, dynamic = 0 + test_matching_arguments_rank<8, 0, StaticRank>(); // dim = 8, dynamic = 0 +#endif +} - { - // test View parameters for View dim = 5, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +template +struct MixedRank { + using type = typename DynamicRank::type[1]; +}; - { - // test View parameters for View dim = 6, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 7, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ 
Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } -} +template <> +struct MixedRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType * [1]; - using DType_3 = DType * * [1]; - using DType_4 = DType ** * [1]; - using DType_5 = DType *** * [1]; - using DType_6 = DType **** * [1]; - using DType_7 = DType ***** * [1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ 
Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 3, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, MixedRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, MixedRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 1, MixedRank>(); // dim = 2, dynamic = 1 + test_matching_arguments_rank<3, 2, MixedRank>(); // dim = 3, dynamic = 2 + test_matching_arguments_rank<4, 3, MixedRank>(); // dim = 4, dynamic = 3 + test_matching_arguments_rank<5, 4, MixedRank>(); // dim = 5, dynamic = 4 + test_matching_arguments_rank<6, 5, MixedRank>(); // dim = 6, dynamic = 5 + test_matching_arguments_rank<7, 6, MixedRank>(); // dim = 7, dynamic = 6 + test_matching_arguments_rank<8, 7, MixedRank>(); // dim = 8, dynamic = 7 +#endif +} - { - // test View parameters for View dim = 4, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } +#define CHECK_DEATH(EXPR) \ + ASSERT_DEATH(EXPR, \ + "The specified run-time extent for Kokkos::View 'v' does not " \ + "match the compile-time extent in 
dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 5, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } +#define CHECK_DEATH_UNMANAGED(EXPR) \ + ASSERT_DEATH( \ + EXPR, \ + "The specified run-time extent for Kokkos::View 'UNMANAGED' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 6, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_static_extents) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - { - // test View parameters for View dim = 7, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + // clang-format off + CHECK_DEATH({ Kokkos::View v("v", 2); }); + CHECK_DEATH({ 
Kokkos::View v("v", 2, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1, 1); }); + + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1, 1); }); + // clang-format on +#endif } -#endif // KOKKOS_ENABLE_OPENMPTARGET - -#undef PARAM_0 -#undef PARAM_1 -#undef PARAM_2 -#undef PARAM_3 -#undef PARAM_4 -#undef PARAM_5 -#undef PARAM_6 -#undef PARAM_7 -#undef PARAM_0_RANK -#undef PARAM_1_RANK -#undef PARAM_2_RANK -#undef PARAM_3_RANK -#undef PARAM_4_RANK -#undef PARAM_5_RANK -#undef PARAM_6_RANK -#undef PARAM_7_RANK - -#undef DType - -#undef LIVE -#undef DIE +#undef CHECK_DEATH +#endif // KOKKOS_ENABLE_OPENMPTARGET } // namespace Test From 96d530a24034467db26070081b62a0a23c60530e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 9 Jan 2024 16:46:59 -0500 Subject: [PATCH 205/432] Remove Kokkos::[b]half_t volatile overloads (#6579) * Deprecate Kokkos::[b]half_t volatile overloads * Remove volatile [b]half_t altogether --- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 114 +---------------- core/unit_test/TestHalfOperators.hpp | 117 ------------------ 2 files changed, 4 insertions(+), 227 deletions(-) diff --git a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp 
b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 4a22898d168..bcce013b00e 100644 --- a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -196,12 +196,12 @@ KOKKOS_INLINE_FUNCTION template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); + T x, const Kokkos::Impl::half_impl_t::type&); #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); + T x, const Kokkos::Impl::bhalf_impl_t::type&); #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED template @@ -283,13 +283,6 @@ class alignas(FloatType) floating_point_wrapper { private: impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t>>; - static_assert(!std::is_void::value, - "Invalid impl_type"); public: // In-class initialization and defaulted default constructors not used @@ -318,18 +311,6 @@ class alignas(FloatType) floating_point_wrapper { default; #endif - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast(rhs); @@ -492,15 +473,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - template - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast( - 
const_cast(&val)); - *val_ptr = reinterpret_cast(new_val); - } - // Compound operators KOKKOS_FUNCTION floating_point_wrapper& operator+=(floating_point_wrapper rhs) { @@ -515,15 +487,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - // Compound operators: upcast overloads for += template KOKKOS_FUNCTION friend std::enable_if_t< @@ -560,15 +523,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator-=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for -= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -605,15 +559,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for *= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -650,15 +595,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for /= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -884,27 +820,6 @@ class alignas(FloatType) floating_point_wrapper { #endif } - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& 
rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -923,13 +838,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs < static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -948,13 +856,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs > static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -973,13 +874,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs <= static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -1018,14 +912,14 @@ class alignas(FloatType) floating_point_wrapper { // Declare wrapper overloads now that floating_point_wrapper is declared template static KOKKOS_INLINE_FUNCTION 
Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { + T x, const Kokkos::Impl::half_impl_t::type&) { return Kokkos::Experimental::cast_to_half(x); } #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + T x, const Kokkos::Impl::bhalf_impl_t::type&) { return Kokkos::Experimental::cast_to_bhalf(x); } #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index 752e3b50816..c69cdd57034 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -268,96 +268,6 @@ enum OP_TESTS { N_OP_TESTS }; -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) -template -struct Functor_TestHalfVolatileOperators { - volatile half_type h_lhs, h_rhs; - view_type actual_lhs, expected_lhs; - double d_lhs, d_rhs; - Functor_TestHalfVolatileOperators(volatile half_type lhs = half_type(0), - volatile half_type rhs = half_type(0)) - : h_lhs(lhs), h_rhs(rhs) { - actual_lhs = view_type("actual_lhs", N_OP_TESTS); - expected_lhs = view_type("expected_lhs", N_OP_TESTS); - half_type nv_tmp; - nv_tmp = h_lhs; - d_lhs = static_cast(nv_tmp); - nv_tmp = h_rhs; - d_rhs = static_cast(nv_tmp); - if (std::is_same::value) { - auto run_on_host = *this; - run_on_host(0); - } else { - Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators", - Kokkos::RangePolicy(0, 1), *this); - } - } - - KOKKOS_FUNCTION - void operator()(int) const { - volatile half_type tmp_lhs; - half_type nv_tmp; - - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { - actual_lhs(i) = 1; - expected_lhs(i) = -1; - } - - nv_tmp = h_lhs; - actual_lhs(ASSIGN) = static_cast(nv_tmp); - expected_lhs(ASSIGN) = d_lhs; - 
- actual_lhs(LT_H_H) = h_lhs < h_rhs; - expected_lhs(LT_H_H) = d_lhs < d_rhs; - - actual_lhs(LE_H_H) = h_lhs <= h_rhs; - expected_lhs(LE_H_H) = d_lhs <= d_rhs; - - actual_lhs(NEQ) = h_lhs != h_rhs; - expected_lhs(NEQ) = d_lhs != d_rhs; - - actual_lhs(GT_H_H) = h_lhs > h_rhs; - expected_lhs(GT_H_H) = d_lhs > d_rhs; - - actual_lhs(GE_H_H) = h_lhs >= h_rhs; - expected_lhs(GE_H_H) = d_lhs >= d_rhs; - - actual_lhs(EQ) = h_lhs == h_rhs; - expected_lhs(EQ) = d_lhs == d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs += h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CADD_H_H) = static_cast(nv_tmp); - expected_lhs(CADD_H_H) = d_lhs; - expected_lhs(CADD_H_H) += d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs -= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CSUB_H_H) = static_cast(nv_tmp); - expected_lhs(CSUB_H_H) = d_lhs; - expected_lhs(CSUB_H_H) -= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs *= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CMUL_H_H) = static_cast(nv_tmp); - expected_lhs(CMUL_H_H) = d_lhs; - expected_lhs(CMUL_H_H) *= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs /= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CDIV_H_H) = static_cast(nv_tmp); - expected_lhs(CDIV_H_H) = d_lhs; - expected_lhs(CDIV_H_H) /= d_rhs; - } -}; -#endif - template struct Functor_TestHalfOperators { half_type h_lhs, h_rhs; @@ -995,33 +905,6 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { static_cast(epsilon)); } -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) - // Test partial volatile support - volatile half_type _h_lhs = h_lhs; - volatile half_type _h_rhs = h_rhs; - Functor_TestHalfVolatileOperators f_volatile_device( - _h_lhs, _h_rhs); - Functor_TestHalfVolatileOperators f_volatile_host( - _h_lhs, _h_rhs); - - ExecutionSpace().fence(); - Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); - Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); - for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // 
printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || - op_test == NEQ || op_test == EQ || op_test == GT_H_H || - op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || - op_test == CMUL_H_H || op_test == CDIV_H_H) { - ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - static_cast(epsilon)); - ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - static_cast(epsilon)); - } - } -#endif - // is_trivially_copyable is false with the addition of explicit // copy constructors that are required for supporting reductions // ASSERT_TRUE(std::is_trivially_copyable::value); From 67340103852ddcc11cd67695cb6520934d1c333a Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Wed, 10 Jan 2024 09:47:26 +0100 Subject: [PATCH 206/432] add tests --- ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 41 +++++-- ...estStdAlgorithmsTransformInclusiveScan.cpp | 105 ++++++++++++------ 2 files changed, 100 insertions(+), 46 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 4b316602326..10454d65515 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -131,7 +131,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned 
iterator from the beginning // of the interval that team operates on and then we check that these @@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { } #undef transform_inclusive_scan - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) 
{ run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformInclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index d96e582582f..fb81ae91b04 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -173,24 +173,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -211,30 +202,11 @@ struct SumBinaryFunctor { std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void print_scenario_details(const std::string& name, BopT bop, UopT uop) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << std::endl; -} - -template -void print_scenario_details(const std::string& name, BopT bop, UopT 
uop, - ValueType init_value) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << ", " - << "init = " << init_value << std::endl; -} - template void run_single_scenario(const InfoType& scenario_info, Args... args /* by value on purpose*/) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // print_scenario_details(name, args...); auto view_dest = create_view(Tag{}, view_ext, "transform_inclusive_scan"); @@ -279,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* by value on purpose*/) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to scenario and is not modified + // view2: filled according to scenario and used for the in-place op + // Therefore, after the op is done, view_2 should contain the + // result of doing inclusive scan. 
+ // NOTE: view2 must be filled before every call to the algorithm + // because the algorithm acts in place + + auto view_1 = create_view(Tag{}, view_ext, + "transform_inclusive_scan_view_1"); + fill_view(view_1, name); + + auto view_2 = create_view(Tag{}, view_ext, + "transform_inclusive_scan_view_2"); + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2), + KE::cend(view_2), KE::begin(view_2), + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), + KE::cbegin(view_2), KE::cend(view_2), + KE::begin(view_2), args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), view_2, view_2, + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + Kokkos::fence(); +} + template void run_all_scenarios() { const std::map scenarios = { @@ -295,15 +324,23 @@ void run_all_scenarios() { run_single_scenario(it, bop_t(), uop_t(), ValueType{2}); run_single_scenario(it, bop_t(), uop_t(), ValueType{-1}); run_single_scenario(it, bop_t(), uop_t(), ValueType{-2}); + + run_single_scenario_inplace(it, bop_t(), uop_t()); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{0}); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{2}); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{-2}); } } #if !defined KOKKOS_ENABLE_OPENMPTARGET TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) { run_all_scenarios(); - // run_all_scenarios(); - // run_all_scenarios(); - // run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); + 
run_all_scenarios(); } #endif From 0ba8c40fc56c54bc0fd5808a23cf44e48749c187 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 10 Jan 2024 17:29:05 -0500 Subject: [PATCH 207/432] Provide `kokkos_swap` as part of Core and deprecate `Experimental::swap` in Algorithms (#6697) * Move Kokkos_Swap.hpp header to core * Include only what we use (in ) * Rename header guards in (because I can) * Drop useless comment * Fix noexcept specification of Kokkos::swap * Constexprify Kokkos::swap * Fixup type requirements in Kokkos::swap * Drop the inline (because I can) * fixup! Move Kokkos_Swap.hpp header to core * Promote swap to namespace Kokkos:: * Move and update Kokkos::swap unit test to core * Add support for swapping arrays * Fixup size_t * Provide Kokkos::swap overload for a few classes * Rename Kokkos::swap -> Kokkos::kokkos_swap * For backward compatibility provide deprecated Experimental::swap --- algorithms/src/Kokkos_StdAlgorithms.hpp | 1 - .../sorting/impl/Kokkos_NestedSortImpl.hpp | 3 +- .../src/std_algorithms/Kokkos_IterSwap.hpp | 13 +++- algorithms/src/std_algorithms/Kokkos_Swap.hpp | 41 ----------- .../std_algorithms/impl/Kokkos_Reverse.hpp | 3 +- .../std_algorithms/impl/Kokkos_SwapRanges.hpp | 3 +- .../unit_tests/TestStdAlgorithmsModOps.cpp | 44 ------------ core/src/Kokkos_Array.hpp | 1 + core/src/Kokkos_Core.hpp | 1 + core/src/Kokkos_Pair.hpp | 2 +- core/src/Kokkos_Swap.hpp | 68 +++++++++++++++++++ core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestSwap.hpp | 68 +++++++++++++++++++ 13 files changed, 154 insertions(+), 95 deletions(-) delete mode 100644 algorithms/src/std_algorithms/Kokkos_Swap.hpp create mode 100644 core/src/Kokkos_Swap.hpp create mode 100644 core/unit_test/TestSwap.hpp diff --git a/algorithms/src/Kokkos_StdAlgorithms.hpp b/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10bf..b532a774e13 100644 --- a/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std 
classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp index 50ac8233195..2fe58272d92 100644 --- a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ -18,7 +18,6 @@ #define KOKKOS_NESTED_SORT_IMPL_HPP_ #include -#include namespace Kokkos { namespace Experimental { @@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl( keyView(elem1) = key2; keyView(elem2) = key1; if constexpr (!std::is_same_v) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); } } } diff --git a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dda..5bb2d1039dc 100644 --- a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), + std::declval())) { + ::Kokkos::kokkos_swap(a, b); +} +#endif + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/algorithms/src/std_algorithms/Kokkos_Swap.hpp deleted file mode 100644 index 
acd2a572c8c..00000000000 --- a/algorithms/src/std_algorithms/Kokkos_Swap.hpp +++ /dev/null @@ -1,41 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP -#define KOKKOS_STD_ALGORITHMS_SWAP_HPP - -#include - -namespace Kokkos { -namespace Experimental { - -// swap -template -KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept { - static_assert( - std::is_move_assignable::value && std::is_move_constructible::value, - "Kokkos::Experimental::swap arguments must be move assignable " - "and move constructible"); - - T tmp = std::move(a); - a = std::move(b); - b = std::move(tmp); -} - -} // namespace Experimental -} // namespace Kokkos - -#endif diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index 428dc0d744a..b4046c7645b 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -39,7 +38,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); + ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]); } KOKKOS_FUNCTION diff --git a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 5bc77ed7ddc..930a14ac48c 
100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -36,7 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); + ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]); } KOKKOS_FUNCTION diff --git a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index c0130885dc5..1b1a02f39c4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) { } } -// ------------ -// swap -// ------------ -TEST(std_algorithms_mod_ops_test, swap) { - { - int a = 1; - int b = 2; - KE::swap(a, b); - ASSERT_EQ(a, 2); - ASSERT_EQ(b, 1); - } - - { - double a = 3.; - double b = 1.; - KE::swap(a, b); - EXPECT_DOUBLE_EQ(a, 1.); - EXPECT_DOUBLE_EQ(b, 3.); - } -} - -template -struct StdAlgoModSeqOpsTestSwap { - ViewType m_view; - - KOKKOS_INLINE_FUNCTION - void operator()(const int index) const { - typename ViewType::value_type newval{11}; - KE::swap(m_view(index), newval); - } - - StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {} -}; - -TEST(std_algorithms_mod_ops_test, swap_within_parfor) { - auto a = create_view(stdalgos::DynamicTag{}, 10, "a"); - StdAlgoModSeqOpsTestSwap fnc(a); - Kokkos::parallel_for(a.extent(0), fnc); - auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a); - for (std::size_t i = 0; i < a.extent(0); ++i) { - EXPECT_DOUBLE_EQ(a_h(0), 11.); - } -} - // ------------ // iter_swap // ------------ diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 7773d20339a..ba1626bb72e 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ 
-22,6 +22,7 @@ #endif #include +#include #include #include diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index cde77dc3e90..d9e5640cd9a 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 7127c78280e..9be8d8d7aa1 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -28,6 +28,7 @@ #endif #include +#include #include namespace Kokkos { @@ -484,7 +485,6 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( } namespace Impl { - template struct is_pair_like : std::false_type {}; template diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp new file mode 100644 index 00000000000..2f849a13ab6 --- /dev/null +++ b/core/src/Kokkos_Swap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SWAP_HPP +#define KOKKOS_SWAP_HPP + +#include + +#include +#include +#include + +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr std::enable_if_t && + std::is_move_assignable_v> +kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& + std::is_nothrow_move_assignable_v) { + T t(std::move(a)); + a = std::move(b); + b = std::move(t); +} + +namespace Impl { + +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int); + struct Nope; + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; +}; + +template +inline constexpr bool is_nothrow_swappable_v = + noexcept(kokkos_swap(std::declval(), std::declval())); + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr std::enable_if_t::value> +kokkos_swap(T (&a)[N], T (&b)[N]) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } +} + +} // namespace Kokkos + +#endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 72a34afa2bc..7bbf72c2533 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -200,6 +200,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc + Swap ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid diff --git a/core/unit_test/TestSwap.hpp b/core/unit_test/TestSwap.hpp new file mode 100644 index 00000000000..4e98351cf19 --- /dev/null +++ b/core/unit_test/TestSwap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include + +namespace { + +template +struct TestSwap { + KOKKOS_FUNCTION void operator()(int, int& err) const { + { + int a = 1; + int b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int, int)\n"); + ++err; + } + } + { + float a = 1; + float b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(float, float)\n"); + ++err; + } + } + { + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + Kokkos::kokkos_swap(a, b); + if (!(a[0] == 4 && a[1] == 5 && a[2] == 6 && b[0] == 1 && b[1] == 2 && + b[2] == 3)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int[3], int[3])\n"); + ++err; + } + } + } + + TestSwap() { + int errors; + Kokkos::parallel_reduce( + "TestSwap", Kokkos::RangePolicy(0, 1), *this, errors); + EXPECT_EQ(errors, 0); + } +}; + +TEST(TEST_CATEGORY, kokkos_swap) { TestSwap(); } + +} // namespace From 89ba3fbae8ab2ae2dbfd30eef05c3bac387630a5 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Thu, 11 Jan 2024 11:13:24 -0500 Subject: [PATCH 208/432] Provide new public headers `<Kokkos_Clamp.hpp>` and `<Kokkos_MinMax.hpp>` (#6687) * Promote min, max, clamp to public * Drop unnecessary header includes * Split Kokkos_MinMaxClamp.hpp into Kokkos_MinMax.hpp and Kokkos_Clamp.hpp --------- Co-authored-by: Damien L-G --- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 1 - core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 1 - core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 2 +- .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 2 +- core/src/Kokkos_Clamp.hpp | 41 +++++++++++++++++++ core/src/Kokkos_Core.hpp | 3 +- ...kkos_MinMaxClamp.hpp
=> Kokkos_MinMax.hpp} | 25 +---------- core/src/Kokkos_View.hpp | 2 +- 8 files changed, 48 insertions(+), 29 deletions(-) create mode 100644 core/src/Kokkos_Clamp.hpp rename core/src/{Kokkos_MinMaxClamp.hpp => Kokkos_MinMax.hpp} (86%) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 13feed64e3e..d1b0dbb815f 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index fcbd75c57f9..08cb3a3ed45 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 23dc569ab15..6496f47462a 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index 984fa9d3d2d..67e1181125c 100644 --- a/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP #define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP -#include +#include namespace Kokkos { namespace Impl { diff --git a/core/src/Kokkos_Clamp.hpp b/core/src/Kokkos_Clamp.hpp new file mode 100644 index 00000000000..033cde9ab84 --- /dev/null +++ b/core/src/Kokkos_Clamp.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CLAMP_HPP +#define KOKKOS_CLAMP_HPP + +#include + +namespace Kokkos { + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi) { + KOKKOS_EXPECTS(!(hi < lo)); + return (value < lo) ? lo : (hi < value) ? hi : value; +} + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi, + ComparatorType comp) { + KOKKOS_EXPECTS(!comp(hi, lo)); + return comp(value, lo) ? lo : comp(hi, value) ? hi : value; +} + +} // namespace Kokkos + +#endif diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index d9e5640cd9a..b4863620cde 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -47,7 +47,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/core/src/Kokkos_MinMaxClamp.hpp b/core/src/Kokkos_MinMax.hpp similarity index 86% rename from core/src/Kokkos_MinMaxClamp.hpp rename to core/src/Kokkos_MinMax.hpp index 09ae9689f62..5c60a88bfb1 100644 --- a/core/src/Kokkos_MinMaxClamp.hpp +++ b/core/src/Kokkos_MinMax.hpp @@ -14,13 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MIN_MAX_CLAMP_HPP -#define KOKKOS_MIN_MAX_CLAMP_HPP +#ifndef KOKKOS_MIN_MAX_HPP +#define KOKKOS_MIN_MAX_HPP #include #include @@ -29,22 +24,6 @@ static_assert(false, namespace Kokkos { -// clamp -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& 
lo, - const T& hi) { - KOKKOS_EXPECTS(!(hi < lo)); - return (value < lo) ? lo : (hi < value) ? hi : value; -} - -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi, - ComparatorType comp) { - KOKKOS_EXPECTS(!comp(hi, lo)); - return comp(value, lo) ? lo : comp(hi, value) ? hi : value; -} - // max template constexpr KOKKOS_INLINE_FUNCTION const T& max(const T& a, const T& b) { diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 7a2e0eb0323..484a0e6f62e 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -39,7 +39,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include #endif -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- From ee5cbfc25c81b9efc5b2a4abd82d9a9d5d3d07bd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 11 Jan 2024 13:51:28 -0500 Subject: [PATCH 209/432] Fix TeamThreadMDRange parallel_reduce (#6511) * Fix TeamThreadRangeMD parallel_reduce * Use vector_reduce for Cuda, HIP, and SYCL * Initialize reduction variable * Test with maximal vector length * TeamHandle::execution_space->typename TeamHandle::execution_space * Fix OpenMPTarget * Restore tests * reducer is unused * Workarounds for OpenMPTarget * Reduce * Fix OpenMPTarget tests * Restore tests * Strengthen test case for OpenMPTarget * Add some static_asserts --- .../unit_tests/TestStdAlgorithmsCommon.hpp | 8 - core/src/Kokkos_ExecPolicy.hpp | 55 ++ .../Kokkos_OpenMPTarget_Parallel.hpp | 21 +- core/unit_test/TestTeamMDRange.hpp | 475 ++++++++++++++---- 4 files changed, 448 insertions(+), 111 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 3eb963faf2d..67052e2f9d4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ 
-239,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result( // set accum to 1 if a mismach is found const bool mismatch = memberValue != target; int accum = static_cast(mismatch); - // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and - // ignores the reducer passed -#if defined KOKKOS_ENABLE_OPENMPTARGET - Kokkos::Sum dummyReducer(accum); - const auto result = teamHandle.team_reduce(accum, dummyReducer); - return (result == 0); -#else teamHandle.team_reduce(Kokkos::Sum(accum)); return (accum == 0); -#endif } template diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index ae1585a4989..389dcf19368 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -983,7 +983,16 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + policy.team.team_reduce( + Kokkos::Sum{val}); } template @@ -997,7 +1006,29 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); } template @@ -1011,7 +1042,31 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; 
Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); + policy.team.team_reduce( + Kokkos::Sum{val}); } template diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 466dee2a563..dcc509d2faf 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -107,14 +107,20 @@ class OpenMPTargetExecTeamMember { team_broadcast(value, thread_id); } - // FIXME_OPENMPTARGET this function has the wrong interface and currently - // ignores the reducer passed. - template - KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, - const JoinOp&) const { + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + // FIXME_OPENMPTARGET this function currently ignores the reducer passed. + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(ReducerType const&, typename ReducerType::value_type& value) const + noexcept { #pragma omp barrier - using value_type = ValueType; + using value_type = typename ReducerType::value_type; // const JoinLambdaAdapter op(op_in); // Make sure there is enough scratch space: @@ -143,8 +149,9 @@ class OpenMPTargetExecTeamMember { } #pragma omp barrier } - return team_scratch[0]; + value = team_scratch[0]; } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. 
* diff --git a/core/unit_test/TestTeamMDRange.hpp b/core/unit_test/TestTeamMDRange.hpp index 6e65cde0cf8..7948dd8b1a5 100644 --- a/core/unit_test/TestTeamMDRange.hpp +++ b/core/unit_test/TestTeamMDRange.hpp @@ -169,7 +169,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -202,7 +209,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -236,7 +250,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -272,7 +293,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), 
KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -310,7 +338,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -350,7 +385,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -420,7 +462,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -457,7 +506,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -496,7 +552,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { 
FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -536,7 +599,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -579,7 +649,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -620,7 +697,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -653,7 +737,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, 
+#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -687,7 +778,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -723,7 +821,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -761,7 +866,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -801,7 +913,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = 
team.league_rank(); @@ -908,13 +1027,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k) = fillFlattenedIndex(i, j, k); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -923,7 +1049,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -952,13 +1084,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -966,7 +1105,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& threadSum) { threadSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += 
teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -997,13 +1142,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1013,7 +1165,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1045,13 +1203,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ 
-1061,7 +1226,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1100,13 +1271,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1116,7 +1294,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1157,13 +1341,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType 
teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1174,7 +1365,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1207,10 +1404,17 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { auto leagueRank = team.league_rank(); DataType teamSum = 0; @@ -1263,10 +1467,17 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { auto leagueRank = team.league_rank(); DataType teamSum = 0; @@ -1321,10 +1532,17 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + 
ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { auto leagueRank = team.league_rank(); DataType teamSum = 0; @@ -1384,10 +1602,17 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { auto leagueRank = team.league_rank(); DataType teamSum = 0; @@ -1451,10 +1676,17 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { auto leagueRank = team.league_rank(); DataType teamSum = 0; @@ -1510,13 +1742,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1527,7 +1766,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, 
const int& k, DataType& vectorSum) { vectorSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1558,13 +1803,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1577,7 +1829,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1609,13 +1867,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = 
team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1628,7 +1893,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1665,13 +1936,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1684,7 +1962,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1725,13 +2009,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = 
team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1745,7 +2036,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1904,13 +2201,6 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestThreadVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_ThreadVectorMDRange(dims); TestThreadVectorMDRangeParallelReduce:: @@ -1944,13 +2234,6 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. 
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestTeamVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_TeamVectorMDRange(dims); TestTeamVectorMDRangeParallelReduce:: From 36da6cca7fb4b4b7082f0ec91b4e388578787e8f Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 11 Jan 2024 21:15:40 +0100 Subject: [PATCH 210/432] add tests for in-place `inclusive_scan` (#6682) * add test for inclusive scan in place * refine tests * remove comment * revert transform since it is moved to separate pr --- .../TestStdAlgorithmsInclusiveScan.cpp | 225 +++++++++--------- .../TestStdAlgorithmsTeamInclusiveScan.cpp | 40 +++- 2 files changed, 144 insertions(+), 121 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index fa6294ea4ca..a08a7372108 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -144,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - BinaryOp bop, Args... args /* copy on purpose */) { - //! 
always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), bop, args...); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - - const auto ext = test_view_h.extent(0); - if (ext > 0) { - for (std::size_t i = 0; i < ext; ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - // std::cout << " last el: " << test_view_h(ext-1) << std::endl; - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -205,107 +160,151 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + BinaryOp bop, Args... args /* copy on purpose */) { + //! 
always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), bop, args...); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + + const auto ext = test_view_h.extent(0); + if (ext > 0) { + for (std::size_t i = 0; i < ext; ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view) // the view to test + { + using value_type = typename ViewType1::non_const_value_type; + (*this)(data_view, test_view, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, + Args... 
args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "inclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << std::endl; auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm everytime to + // ensure the algorithm does something meaningful { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan(exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest); + auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); + auto r = + KE::inclusive_scan("label", exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } 
Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, - Args... args /* copy on purpose */) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // if (1 == sizeof...(Args)) { - // std::cout << "inclusive_scan custom op and init value: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } else { - // std::cout << "inclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view_2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place - auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); - fill_view(view_from, name); + auto view1 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - 
fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } Kokkos::fence(); @@ -319,27 +318,35 @@ void run_inclusive_scan_all_scenarios() { {"medium-a", 313}, {"medium-b", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it); + run_single_scenario(it); + run_single_scenario_inplace(it); #if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor; sum_binary_op sbop; - run_single_scenario_custom_op(it, sbop); - run_single_scenario_custom_op(it, sbop, ValueType{0}); - run_single_scenario_custom_op(it, sbop, ValueType{1}); - run_single_scenario_custom_op(it, sbop, ValueType{-2}); - run_single_scenario_custom_op(it, sbop, ValueType{3}); + run_single_scenario(it, sbop); + run_single_scenario(it, sbop, ValueType{0}); + run_single_scenario(it, sbop, ValueType{1}); + 
run_single_scenario(it, sbop, ValueType{-2}); + run_single_scenario(it, sbop, ValueType{3}); + + run_single_scenario_inplace(it, sbop, ValueType{0}); + run_single_scenario_inplace(it, sbop, ValueType{-2}); // custom multiply only for small views to avoid overflows if (it.first == "small-a" || it.first == "small-b") { using mult_binary_op = MultiplyFunctor; mult_binary_op mbop; - run_single_scenario_custom_op(it, mbop); - run_single_scenario_custom_op(it, mbop, ValueType{0}); - run_single_scenario_custom_op(it, mbop, ValueType{1}); - run_single_scenario_custom_op(it, mbop, ValueType{-2}); - run_single_scenario_custom_op(it, mbop, ValueType{3}); + run_single_scenario(it, mbop); + run_single_scenario(it, mbop, ValueType{0}); + run_single_scenario(it, mbop, ValueType{1}); + run_single_scenario(it, mbop, ValueType{-2}); + run_single_scenario(it, mbop, ValueType{3}); + + run_single_scenario_inplace(it, mbop); + run_single_scenario_inplace(it, mbop, ValueType{0}); + run_single_scenario_inplace(it, mbop, ValueType{-2}); } #endif } diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 0daf9dbfe82..642a8494390 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -139,7 +139,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of 
the interval that team operates on and then we check that these @@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -251,16 +258,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef inclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3, 4, 5}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -270,6 +282,10 @@ TEST(std_algorithms_inclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + 
run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamInclusiveScan From 0254c631bb2496fb80669d050e53d37d258fd4d1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 11 Jan 2024 23:08:33 -0500 Subject: [PATCH 211/432] Drop pointless Kokkos::Impl::CudaExec forward declaration There is no definition for it... --- core/src/Cuda/Kokkos_Cuda.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 3fda01964f6..3e2c6c7f928 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -46,7 +46,6 @@ static_assert(false, namespace Kokkos { namespace Impl { -class CudaExec; class CudaInternal; } // namespace Impl } // namespace Kokkos From 9393b358fb1300d15c41957c42a566f329bcac85 Mon Sep 17 00:00:00 2001 From: Ben Cowan Date: Fri, 12 Jan 2024 11:17:17 -0700 Subject: [PATCH 212/432] Don't use the compiler launcher script if the compile language is CUDA. (#6704) * Don't use the compiler launcher script if the compile language is CUDA. * Updated descriptive comment. 
--- cmake/KokkosConfig.cmake.in | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/KokkosConfig.cmake.in b/cmake/KokkosConfig.cmake.in index e26c75b3122..1b6d1b66ff5 100644 --- a/cmake/KokkosConfig.cmake.in +++ b/cmake/KokkosConfig.cmake.in @@ -39,10 +39,12 @@ IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) GLOBAL CHECK_CUDA_COMPILES) -ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +ELSEIF(@Kokkos_ENABLE_CUDA@ + AND NOT @KOKKOS_COMPILE_LANGUAGE@ STREQUAL CUDA + AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) # - # if CUDA was enabled, separable compilation was not specified, and current compiler - # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # if CUDA was enabled, the compilation language was not set to CUDA, and separable compilation was not + # specified, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, # otherwise, the original command will be executed From c75d730d203077bf79439c9b3f4a33cbbc18c6a9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 12 Jan 2024 16:00:30 -0500 Subject: [PATCH 213/432] Deprecate `{Cuda,HIP}::detect_device_count()` and `Cuda::[detect_]device_arch()` (#6710) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CUDA/HIP: Inline getting device count get_device_count() -> int cannot be generic. Get rid of the extra indirection because it brings no benefit. 
* Get rid of CudaInternalDevices and cleanup Cuda::print_configuration() * Get rid of cuda_get_device_{count,properties}_wrapper() * Missed a few CudaInternalDevices and get rid of Cuda::detect_device_arch() * Get rid of Cuda::device_arch() * Fixup Co-authored-by: Bruno Turcksin * Don’t mess with Voodoo * Be more conservative and deprecate before removing * Clang-format for suggestion made on GH * Remove stray const qualifier * Forgot that device_arch() was static * Who Let the Bugs Out?? * The same thing we do every night, Pinky - try to take over the world! * this should fix it --------- Co-authored-by: Bruno Turcksin --- core/src/Cuda/Kokkos_Cuda.hpp | 25 ++++- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 149 +++---------------------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 13 --- core/src/HIP/Kokkos_HIP.hpp | 10 +- core/src/HIP/Kokkos_HIP_Instance.cpp | 8 -- core/src/impl/Kokkos_Core.cpp | 8 +- 6 files changed, 50 insertions(+), 163 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 3e2c6c7f928..90e982474df 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -178,18 +178,37 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Cuda device architecture of the selected device. /// /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); + KOKKOS_DEPRECATED static size_type device_arch() { + const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop(); + return cudaProp.major * 100 + cudaProp.minor; + } //! Query device count. - static size_type detect_device_count(); + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; + } /** \brief Detect the available devices and their architecture * as defined by the __CUDA_ARCH__ specification. 
*/ - static std::vector detect_device_arch(); + KOKKOS_DEPRECATED static std::vector detect_device_arch() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + std::vector out; + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + out.push_back(prop.major * 100 + prop.minor); + } + return out; + } +#endif cudaStream_t cuda_stream() const; int cuda_device() const; diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 804505d0d74..134522dc1c1 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -208,96 +208,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file, host_abort(out.str().c_str()); } -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors -// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// 
int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - -namespace { - -class CudaInternalDevices { - public: - enum { MAXIMUM_DEVICE_COUNT = 64 }; - struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT]; - int m_cudaDevCount; - - CudaInternalDevices(); - - static const CudaInternalDevices &singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() { - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_count_wrapper( - &m_cudaDevCount))); - - if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { - Kokkos::abort( - "Sorry, you have more GPUs per node than we thought anybody would ever " - "have. Please report this to github.com/kokkos/kokkos."); - } - for (int i = 0; i < m_cudaDevCount; ++i) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_properties_wrapper( - m_cudaProp + i, i))); - } -} - -const CudaInternalDevices &CudaInternalDevices::singleton() { - static CudaInternalDevices self; - return self; -} - -} // namespace - //---------------------------------------------------------------------------- int Impl::CudaInternal::concurrency() { @@ -307,8 +217,6 @@ int Impl::CudaInternal::concurrency() { } void CudaInternal::print_configuration(std::ostream &s) const { - const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); - #if defined(KOKKOS_ENABLE_CUDA) s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif @@ -317,15 +225,19 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." 
<< (CUDA_VERSION % 1000) / 10 << '\n'; #endif - for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { - s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." - << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " - << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " + << prop.major << "." << prop.minor + << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) << ", Shared Memory per Block: " - << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + << human_memory_size(prop.sharedMemPerBlock); if (m_cudaDev == i) s << " : Selected"; - s << std::endl; + s << '\n'; } } @@ -666,10 +578,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -Cuda::size_type Cuda::detect_device_count() { - return Impl::CudaInternalDevices::singleton().m_cudaDevCount; -} - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int Cuda::concurrency() { #else @@ -684,11 +592,11 @@ int Cuda::impl_is_initialized() { void Cuda::impl_initialize(InitializationSettings const &settings) { const int cuda_device_id = Impl::get_gpu(settings); - const auto &dev_info = Impl::CudaInternalDevices::singleton(); - - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; - Impl::CudaInternal::m_deviceProp = cudaProp; + cudaDeviceProp cudaProp; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGetDeviceProperties(&cudaProp, cuda_device_id)); + Impl::CudaInternal::m_deviceProp = cudaProp; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); @@ -765,33 +673,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default /*manage*/ 
true); } -std::vector Cuda::detect_device_arch() { - const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); - - std::vector output(s.m_cudaDevCount); - - for (int i = 0; i < s.m_cudaDevCount; ++i) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; - } - - return output; -} - -Cuda::size_type Cuda::device_arch() { - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; - - int dev_arch = 0; - - if (0 <= dev_id) { - const struct cudaDeviceProp &cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; - - dev_arch = cudaProp.major * 100 + cudaProp.minor; - } - - return dev_arch; -} - void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } Cuda::Cuda() diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 29e4e3f9679..e68dca6b900 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -265,19 +265,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - cudaError_t cuda_get_device_count_wrapper(int* count) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceCount(count); - } - - template - cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop, - int device) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceProperties(prop, device); - } - template const char* cuda_get_error_name_wrapper(cudaError_t error) const { if constexpr (setCudaDevice) set_cuda_device(); diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp index 61ed346b218..11beb48852c 100644 --- a/core/src/HIP/Kokkos_HIP.hpp +++ b/core/src/HIP/Kokkos_HIP.hpp @@ -94,9 +94,13 @@ class HIP { static int impl_is_initialized(); - // static size_type device_arch(); - - static size_type detect_device_count(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + 
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 7f04eb721cb..d1193d53a2b 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -420,12 +420,4 @@ void Kokkos::Impl::create_HIP_instances(std::vector &instances) { } } -//---------------------------------------------------------------------------- - -namespace Kokkos { -HIP::size_type HIP::detect_device_count() { - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - return hipDevCount; -} } // namespace Kokkos diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 5c182db5663..0d10819c7d7 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -131,9 +131,13 @@ void combine(Kokkos::Tools::InitArguments& out, int get_device_count() { #if defined(KOKKOS_ENABLE_CUDA) - return Kokkos::Cuda::detect_device_count(); + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_HIP) - return Kokkos::HIP::detect_device_count(); + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_SYCL) return sycl::device::get_devices(sycl::info::device_type::gpu).size(); #elif defined(KOKKOS_ENABLE_OPENACC) From 868e42e7beb8a49f06b5c7c505c1e1ab5f9eb1e4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 12 Jan 2024 15:57:41 -0500 Subject: [PATCH 214/432] Get rid of CudaInternal::cuda_get_error_{name,string}_wrapper --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 12 ++++-------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 12 ------------ 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 134522dc1c1..18a35b04a48 100644 --- 
a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -183,10 +183,8 @@ void cuda_stream_synchronize( void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -196,10 +194,8 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file, void cuda_internal_error_abort(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index e68dca6b900..52a4d5f5d95 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -265,18 +265,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - const char* cuda_get_error_name_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorName(error); - } - - template - const char* cuda_get_error_string_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorString(error); - } - template cudaError_t cuda_get_last_error_wrapper() const { if constexpr (setCudaDevice) set_cuda_device(); From e6ff1a46969737a0508df86799c93b4051faa6de Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 12 Jan 2024 15:59:09 -0500 Subject: [PATCH 215/432] No need to jump 
through so many hoops to print the error message --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 18a35b04a48..968383f6852 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -600,10 +600,8 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { - std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; - std::string msg = ss.str(); - Kokkos::abort(msg.c_str()); + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"); } int compiled_major = Impl::CudaInternal::m_cudaArch / 100; From 4e33b3bf9d73aeebe5e0897eeacb651e6a498d6a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Sat, 13 Jan 2024 20:45:30 -0500 Subject: [PATCH 216/432] HIP: Forgot to delete matching brace closing the namespace --- core/src/HIP/Kokkos_HIP_Instance.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index d1193d53a2b..5b9b1a5a41a 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -419,5 +419,3 @@ void Kokkos::Impl::create_HIP_instances(std::vector &instances) { instances[s] = HIP(stream, ManageStream::yes); } } - -} // namespace Kokkos From 35a867d374227522e44c5240e18365ac0635980a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Jan 2024 12:45:47 -0500 Subject: [PATCH 217/432] Make initialize and finalize of the Cuda/HIP singleton less special (#6714) Make initialization of the Cuda/HIP singleton less special --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 64 ++++++++++++-------------- core/src/HIP/Kokkos_HIP.cpp | 13 +++++- core/src/HIP/Kokkos_HIP_Instance.cpp | 8 ---- 
3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 968383f6852..b9d332a1107 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -317,22 +317,6 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); // FIXME - } - - // Allocate a staging buffer for constant mem in pinned host memory - // and an event to avoid overwriting driver for previous kernel launches - if (this == &singleton()) { - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( - reinterpret_cast(&constantMemHostStaging), - CudaTraits::ConstantMemoryUsage))); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_create_wrapper(&constantMemReusable))); - } - m_stream = stream; m_manage_stream = manage_stream; for (int i = 0; i < m_n_team_scratch; ++i) { @@ -496,23 +480,6 @@ void CudaInternal::finalize() { was_finalized = true; - // Only finalize this if we're the singleton - if (this == &singleton()) { - (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_free_host_wrapper(constantMemHostStaging))); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_destroy_wrapper(constantMemReusable))); - auto &deep_copy_space = - Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream()))); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { using RecordCuda = Kokkos::Impl::SharedAllocationRecord; using RecordHost = @@ -663,11 +630,40 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace 
by default KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); // FIXME + + // Allocate a staging buffer for constant mem in pinned host memory and an + // event to avoid overwriting driver for previous kernel launches + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocHost( + reinterpret_cast(&Impl::CudaInternal::constantMemHostStaging), + Impl::CudaTraits::ConstantMemoryUsage)); + + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaEventCreate(&Impl::CudaInternal::constantMemReusable)); + Impl::CudaInternal::singleton().initialize(singleton_stream, /*manage*/ true); } -void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } +void Cuda::impl_finalize() { + (void)Impl::cuda_global_unique_token_locks(true); + + desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaEventDestroy(Impl::CudaInternal::constantMemReusable)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Impl::CudaInternal::constantMemHostStaging)); + + auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); + + Impl::CudaInternal::singleton().finalize(); +} Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton(), diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index f78bfd28b2f..8d13866df40 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -92,7 +92,18 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); } -void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } +void HIP::impl_finalize() { + (void)Impl::hip_global_unique_token_locks(true); + + 
desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventDestroy(Impl::HIPInternal::constantMemReusable)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipHostFree(Impl::HIPInternal::constantMemHostStaging)); + + Impl::HIPInternal::singleton().finalize(); +} HIP::HIP() : m_space_instance(&Impl::HIPInternal::singleton(), diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 5b9b1a5a41a..c33d00ad6b8 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -323,14 +323,6 @@ void HIPInternal::finalize() { this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; - if (this == &singleton()) { - (void)Kokkos::Impl::hip_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { using RecordHIP = Kokkos::Impl::SharedAllocationRecord; From 256c0ca62e56216ebd6351b3ed2c7bdacef8c654 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 16 Jan 2024 16:20:13 -0700 Subject: [PATCH 218/432] Kokkos_HIP.cpp: include Kokkos_Core.hpp to resolve errors Resolves errors of form: /home/jenkins/caraway-new/workspace/KokkosKernels_PullRequest_VEGA90A_ROCM560/kokkos/core/src/HIP/Kokkos_HIP.cpp:96:15: error: no member named 'hip_global_unique_token_locks' in namespace 'Kokkos::Impl' (void)Impl::hip_global_unique_token_locks(true); Note: include of Kokkos_HIP_UniqueToken.hpp was insufficient as this triggered new errors: /home/ndellin/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp:40:29: error: implicit instantiation of undefined template 'Kokkos::View' View m_locks; --- core/src/HIP/Kokkos_HIP.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index 8d13866df40..1508ef18a38 
100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include #include #include From 179d2e67fd42eeb894cf9b5e976104a13de990c2 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:20:12 -0700 Subject: [PATCH 219/432] Add bound checks in RangePolicy and MDRangePolicy (#6617) * Added a bounds check in MDRangePolicy that checks that all lower bounds are less than its upper bound * Modified the wording on the abort * Converted the error msg from a stringstream to a string * Modified abort msg * Fixed the unit test output based on backend's default iterate direction * Update core/unit_test/TestMDRangePolicyConstructors.hpp Formatting. Co-authored-by: Damien L-G * Updated RangePolicy to have the same precondition as MDRangePolicy --------- Co-authored-by: Damien L-G --- core/src/KokkosExp_MDRangePolicy.hpp | 10 +++++++++ core/src/Kokkos_ExecPolicy.hpp | 22 +++++++++++++++---- .../TestMDRangePolicyConstructors.hpp | 16 ++++++++++++++ .../unit_test/TestRangePolicyConstructors.hpp | 13 +++++++++++ 4 files changed, 57 insertions(+), 4 deletions(-) diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index d0ae7fdcea5..6810cf329c4 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -327,6 +327,16 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; + + if (m_upper[i] < m_lower[i]) { + std::string msg = + "Kokkos::MDRangePolicy bounds error: The lower bound (" + + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + + "."; + Kokkos::abort(msg.c_str()); + } + if (m_tile[i] <= 0) { m_tune_tile_size = true; if ((inner_direction == Iterate::Right && (i < rank - 
1)) || diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index 389dcf19368..ef33fb0b1b8 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -117,16 +117,18 @@ class RangePolicy : public Impl::PolicyTraits { inline RangePolicy(const typename traits::execution_space& work_space, const member_type work_begin, const member_type work_end) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_bounds_validity(); set_auto_chunk_size(); } /** \brief Total range */ inline RangePolicy(const member_type work_begin, const member_type work_end) : RangePolicy(typename traits::execution_space(), work_begin, work_end) { + check_bounds_validity(); set_auto_chunk_size(); } @@ -136,10 +138,11 @@ class RangePolicy : public Impl::PolicyTraits { const member_type work_begin, const member_type work_end, Args... args) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_bounds_validity(); set_auto_chunk_size(); set(args...); } @@ -149,6 +152,7 @@ class RangePolicy : public Impl::PolicyTraits { inline RangePolicy(const member_type work_begin, const member_type work_end, Args... 
args) : RangePolicy(typename traits::execution_space(), work_begin, work_end) { + check_bounds_validity(); set_auto_chunk_size(); set(args...); } @@ -218,6 +222,16 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask = m_granularity - 1; } + inline void check_bounds_validity() { + if (m_end < m_begin) { + std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + + std::to_string(m_begin) + + ") is greater than the upper bound (" + + std::to_string(m_end) + ")."; + Kokkos::abort(msg.c_str()); + } + } + public: /** \brief Subrange for a partition's rank and size. * diff --git a/core/unit_test/TestMDRangePolicyConstructors.hpp b/core/unit_test/TestMDRangePolicyConstructors.hpp index f577f415e7c..dfde75ce789 100644 --- a/core/unit_test/TestMDRangePolicyConstructors.hpp +++ b/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -93,6 +93,22 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { }, "unsafe narrowing conversion"); } + +TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { + using Policy = Kokkos::MDRangePolicy>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto dim = (Policy::inner_direction == Kokkos::Iterate::Right) ? 
1 : 0; + + ASSERT_DEATH( + { + (void)Policy({100, 100}, {90, 90}); + }, + "Kokkos::MDRangePolicy bounds error: The lower bound \\(100\\) is " + "greater than its upper bound \\(90\\) in dimension " + + std::to_string(dim) + "\\."); +} #endif } // namespace diff --git a/core/unit_test/TestRangePolicyConstructors.hpp b/core/unit_test/TestRangePolicyConstructors.hpp index 0a7e59ed980..6c8c69f2166 100644 --- a/core/unit_test/TestRangePolicyConstructors.hpp +++ b/core/unit_test/TestRangePolicyConstructors.hpp @@ -70,4 +70,17 @@ TEST(TEST_CATEGORY, range_policy_runtime_parameters) { } } +TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) { + using Policy = Kokkos::RangePolicy; + using ChunkSize = Kokkos::ChunkSize; + + ASSERT_DEATH({ (void)Policy(100, 90); }, + "Kokkos::RangePolicy bounds error: The lower bound \\(100\\) is " + "greater than the upper bound \\(90\\)\\."); + + ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); }, + "Kokkos::RangePolicy bounds error: The lower bound \\(100\\) is " + "greater than the upper bound \\(90\\)\\."); +} + } // namespace From f42a8cb0326df4cb68c3e9da87842d4403799386 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 18 Jan 2024 10:26:09 -0500 Subject: [PATCH 220/432] Temporary fix to reenable HIP CI --- .jenkins | 152 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/.jenkins b/.jenkins index ebe20ece8c3..76be63fa574 100644 --- a/.jenkins +++ b/.jenkins @@ -135,82 +135,86 @@ pipeline { } } } -// stage('HIP-ROCm-5.2') { -// agent { -// dockerfile { -// filename 'Dockerfile.hipcc' -// dir 'scripts/docker' -// additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' -// label 'rocm-docker && vega' -// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' -// } -// } -// environment { -// OMP_NUM_THREADS = 8 
-// OMP_MAX_ACTIVE_LEVELS = 3 -// OMP_PLACES = 'threads' -// OMP_PROC_BIND = 'spread' -// } -// steps { -// sh 'ccache --zero-stats' -// sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' -// sh '''rm -rf build && mkdir -p build && cd build && \ -// cmake \ -// -DCMAKE_BUILD_TYPE=Debug \ -// -DCMAKE_CXX_COMPILER=hipcc \ -// -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \ -// -DCMAKE_CXX_STANDARD=17 \ -// -DKokkos_ARCH_NATIVE=ON \ -// -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -// -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -// -DKokkos_ENABLE_TESTS=ON \ -// -DKokkos_ENABLE_BENCHMARKS=ON \ -// -DKokkos_ENABLE_HIP=ON \ -// -DKokkos_ENABLE_OPENMP=ON \ -// -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ -// .. && \ + stage('HIP-ROCm-5.2') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' + label 'nvidia-docker && ampere' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_MAX_ACTIVE_LEVELS = 3 + OMP_PLACES = 'threads' + OMP_PROC_BIND = 'spread' + } + steps { + sh 'ccache --zero-stats' + sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ARCH_AMD_GFX90A=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ + .. 
&& \ + make -j8''' // make -j8 && ctest --verbose''' -// } -// post { -// always { -// sh 'ccache --show-stats' -// } -// } -// } -// stage('HIP-ROCm-5.6-C++20') { -// agent { -// dockerfile { -// filename 'Dockerfile.hipcc' -// dir 'scripts/docker' -// additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' -// label 'rocm-docker && vega' -// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' -// } -// } -// steps { -// sh 'ccache --zero-stats' -// sh '''rm -rf build && mkdir -p build && cd build && \ -// cmake \ -// -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -// -DCMAKE_CXX_COMPILER=hipcc \ -// -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ -// -DCMAKE_CXX_STANDARD=20 \ -// -DKokkos_ARCH_NATIVE=ON \ -// -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -// -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -// -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -// -DKokkos_ENABLE_TESTS=ON \ -// -DKokkos_ENABLE_BENCHMARKS=ON \ -// -DKokkos_ENABLE_HIP=ON \ -// .. 
&& \ + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('HIP-ROCm-5.6-C++20') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' + label 'nvidia-docker && ampere' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ARCH_AMD_GFX90A=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. 
&& \ + make -j8''' // make -j8 && ctest --verbose''' -// } -// post { -// always { -// sh 'ccache --show-stats' -// } -// } -// } + } + post { + always { + sh 'ccache --show-stats' + } + } + } /* stage('OPENMPTARGET-ROCm-5.2') { agent { From 86f5bb7d892b74defdcde7da8866bb9a5d5afbd4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Jan 2024 12:18:48 -0500 Subject: [PATCH 221/432] Let the smart pointer manage the CUDA/HIP stream (#6721) * Let the smart pointer manage the CUDA/HIP stream * Fixup do not null the stream in {Cuda/HIP}Internal::finalize() * Fixup hip not tested either * Not breaking HIP backend for 3rd time in a row Co-authored-by: Bruno Turcksin --------- Co-authored-by: Bruno Turcksin --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 30 +++++++++++++------------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 3 +-- core/src/HIP/Kokkos_HIP.cpp | 18 ++++++++++------ core/src/HIP/Kokkos_HIP_Instance.cpp | 9 ++------ core/src/HIP/Kokkos_HIP_Instance.hpp | 3 +-- 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index b9d332a1107..0e34ccc67c9 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -240,7 +240,7 @@ void CudaInternal::print_configuration(std::ostream &s) const { //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified) { + if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; } @@ -278,7 +278,7 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) @@ -317,8 +317,7 @@ 
void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -497,16 +496,12 @@ void CudaInternal::finalize() { Kokkos::kokkos_free(m_team_scratch_ptr[i]); } - if (m_manage_stream && get_stream() != nullptr) - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream))); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchUnifiedCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -642,8 +637,7 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default KOKKOS_IMPL_CUDA_SAFE_CALL( cudaEventCreate(&Impl::CudaInternal::constantMemReusable)); - Impl::CudaInternal::singleton().initialize(singleton_stream, - /*manage*/ true); + Impl::CudaInternal::singleton().initialize(singleton_stream); } void Cuda::impl_finalize() { @@ -663,6 +657,8 @@ void Cuda::impl_finalize() { cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); Impl::CudaInternal::singleton().finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); } Cuda::Cuda() @@ -677,13 +673,17 @@ KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) manage_stream ? 
Impl::ManageStream::yes : Impl::ManageStream::no) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 52a4d5f5d95..9c452573a51 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -104,7 +104,6 @@ class CudaInternal { mutable size_type* m_scratchFunctor; cudaStream_t m_stream; uint32_t m_instance_id; - bool m_manage_stream; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -131,7 +130,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream); void finalize(); void print_configuration(std::ostream&) const; diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index 1508ef18a38..2a6bfea1d64 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -90,7 +90,7 @@ void HIP::impl_initialize(InitializationSettings const& settings) { hipStream_t singleton_stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); + Impl::HIPInternal::singleton().initialize(singleton_stream); } void HIP::impl_finalize() { @@ -104,6 +104,8 @@ void 
HIP::impl_finalize() { hipHostFree(Impl::HIPInternal::constantMemHostStaging)); Impl::HIPInternal::singleton().finalize(); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); } HIP::HIP() @@ -114,13 +116,17 @@ HIP::HIP() } HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index c33d00ad6b8..d441c59c212 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -159,14 +159,13 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { +void HIPInternal::initialize(hipStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -340,14 +339,10 @@ void HIPInternal::finalize() { Kokkos::kokkos_free(m_team_scratch_ptr[i]); } - if (m_manage_stream && m_stream != nullptr) - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchSpace = 
nullptr; m_scratchFlags = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 63ad66686bb..142008124af 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -98,7 +98,6 @@ class HIPInternal { uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); - bool m_manage_stream = false; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -124,7 +123,7 @@ class HIPInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(hipStream_t stream, bool manage_stream); + void initialize(hipStream_t stream); void finalize(); void print_configuration(std::ostream &) const; From 9eca17795dfc38564aaf53f5e26899e4f89d114d Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 18 Jan 2024 13:39:18 -0500 Subject: [PATCH 222/432] Fix Docker env variables --- .jenkins | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.jenkins b/.jenkins index 76be63fa574..da21a7096ee 100644 --- a/.jenkins +++ b/.jenkins @@ -142,7 +142,8 @@ pipeline { dir 'scripts/docker' additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' label 'nvidia-docker && ampere' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' +// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } environment { @@ -153,7 +154,6 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' sh 
'''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Debug \ @@ -186,7 +186,8 @@ pipeline { dir 'scripts/docker' additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' label 'nvidia-docker && ampere' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' +// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } steps { From 8f743cf95c4718abaa65fcca42e0e4f5c55c647f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 19 Jan 2024 08:27:06 -0500 Subject: [PATCH 223/432] Ensure view_allocation_error does not silently ignore that no exception was thrown --- core/unit_test/TestViewAPI.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index a8492523a1e..ca098dbc247 100644 --- a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -1553,6 +1553,7 @@ class TestViewAPI { Kokkos::CudaUVMSpace>::value) return; #endif + bool did_throw = false; auto alloc_size = std::numeric_limits::max() - 42; try { auto should_always_fail = dView1("hello_world_failure", alloc_size); @@ -1584,7 +1585,9 @@ class TestViewAPI { "because of an unknown error.", msg); } #endif + did_throw = true; } + ASSERT_TRUE(did_throw); } }; From a3aa567af5d335888b65cfdf1948997dec21d69d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 19 Jan 2024 08:28:36 -0500 Subject: [PATCH 224/432] Add RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc enumerator --- core/src/impl/Kokkos_Error.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/impl/Kokkos_Error.hpp b/core/src/impl/Kokkos_Error.hpp index 
3d0b1d3274c..97b171c477b 100644 --- a/core/src/impl/Kokkos_Error.hpp +++ b/core/src/impl/Kokkos_Error.hpp @@ -58,7 +58,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc { HIPMallocManaged, SYCLMallocDevice, SYCLMallocShared, - SYCLMallocHost + SYCLMallocHost, + OpenACCMalloc, }; private: From f9f3c6e13cde62016b42f856ccc96fe4a8771a28 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 19 Jan 2024 08:30:26 -0500 Subject: [PATCH 225/432] [OpenACC] throw if acc_malloc returned nullptr --- core/src/OpenACC/Kokkos_OpenACCSpace.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 141ec77fd1f..8508a84d6ed 100644 --- a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -66,6 +67,17 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); + if (!ptr) { + size_t alignment = 1; // OpenACC does not handle alignment + using Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode; + auto failure_mode = arg_alloc_size > 0 ? FailureMode::OutOfMemoryError + : FailureMode::InvalidAllocationSize; + using Kokkos::Experimental::RawMemoryAllocationFailure::AllocationMechanism; + auto alloc_mechanism = AllocationMechanism::OpenACCMalloc; + throw Kokkos::Experimental::RawMemoryAllocationFailure( + arg_alloc_size, alignment, failure_mode, alloc_mechanism); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; From 3d33665ffc72c8d7a50ba9ffdbb3972530371de3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 19 Jan 2024 09:55:31 -0500 Subject: [PATCH 226/432] Fixup using declaration --- core/src/OpenACC/Kokkos_OpenACCSpace.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 8508a84d6ed..372d8069c99 100644 --- a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -69,13 +69,15 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( if (!ptr) { size_t alignment = 1; // OpenACC does not handle alignment - using Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode; - auto failure_mode = arg_alloc_size > 0 ? FailureMode::OutOfMemoryError - : FailureMode::InvalidAllocationSize; - using Kokkos::Experimental::RawMemoryAllocationFailure::AllocationMechanism; - auto alloc_mechanism = AllocationMechanism::OpenACCMalloc; - throw Kokkos::Experimental::RawMemoryAllocationFailure( - arg_alloc_size, alignment, failure_mode, alloc_mechanism); + using Kokkos::Experimental::RawMemoryAllocationFailure; + auto failure_mode = + arg_alloc_size > 0 + ? 
RawMemoryAllocationFailure::FailureMode::OutOfMemoryError + : RawMemoryAllocationFailure::FailureMode::InvalidAllocationSize; + auto alloc_mechanism = + RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc; + throw RawMemoryAllocationFailure(arg_alloc_size, alignment, failure_mode, + alloc_mechanism); } if (Kokkos::Profiling::profileLibraryLoaded()) { From 5781d176e8ce86dcc5605c0e668d850a4592bdd1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 19 Jan 2024 13:43:03 -0500 Subject: [PATCH 227/432] Disable openacc.view_allocation_error test --- core/unit_test/TestViewAPI_d.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index b44335279b3..b0d759ffccc 100644 --- a/core/unit_test/TestViewAPI_d.hpp +++ b/core/unit_test/TestViewAPI_d.hpp @@ -35,6 +35,11 @@ TEST(TEST_CATEGORY, view_allocation_error) { #endif #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same_v) { + GTEST_SKIP() << "acc_malloc() not properly returning nullptr"; + } #endif TestViewAPI::run_test_error(); } From 6912b3998d4722c6c19db1a5d110e15d882d0ff5 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 19 Jan 2024 20:48:44 -0500 Subject: [PATCH 228/432] Guard `[MD]RangePolicy` precondition check for deprecated code 4 (#6726) * Guard [MD]RangePolicy precondition check for deprecated code 4 * No good reason to use a raw string literal for the warning msg * Drop pointless inline specifier * Restore original behavior when deprecated code 4 is enabled * first blush * Fixup obviously wont display both message when aborting * Trust me, it's not badly written. It's just way above your head. 
* Add assertions for the old behavior * Fiddling with string comparison * Attempt to resolve death tests error msg matching issues * Fix that stupid regex --- core/src/KokkosExp_MDRangePolicy.hpp | 6 ++- core/src/Kokkos_ExecPolicy.hpp | 12 +++-- core/src/impl/Kokkos_Error.cpp | 12 ++++- core/src/impl/Kokkos_Error.hpp | 2 + .../TestMDRangePolicyConstructors.hpp | 43 +++++++++++++---- .../unit_test/TestRangePolicyConstructors.hpp | 48 +++++++++++++++++-- 6 files changed, 104 insertions(+), 19 deletions(-) diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index 6810cf329c4..ff49c13cbad 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -333,8 +333,12 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { "Kokkos::MDRangePolicy bounds error: The lower bound (" + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + - "."; + ".\n"; +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) Kokkos::abort(msg.c_str()); +#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + Kokkos::Impl::log_warning(msg); +#endif } if (m_tile[i] <= 0) { diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index ef33fb0b1b8..025a2795fb3 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -128,7 +128,6 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief Total range */ inline RangePolicy(const member_type work_begin, const member_type work_end) : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - check_bounds_validity(); set_auto_chunk_size(); } @@ -222,13 +221,20 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask = m_granularity - 1; } - inline void check_bounds_validity() { + void check_bounds_validity() { if (m_end < m_begin) { std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + std::to_string(m_begin) + 
") is greater than the upper bound (" + - std::to_string(m_end) + ")."; + std::to_string(m_end) + ").\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 Kokkos::abort(msg.c_str()); +#endif + m_begin = 0; + m_end = 0; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif } } diff --git a/core/src/impl/Kokkos_Error.cpp b/core/src/impl/Kokkos_Error.cpp index 4babe2d72bd..de6e83ed1f2 100644 --- a/core/src/impl/Kokkos_Error.cpp +++ b/core/src/impl/Kokkos_Error.cpp @@ -21,10 +21,11 @@ #include #include -#include +#include #include #include #include +#include // show_warnings #include #include @@ -38,6 +39,12 @@ void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +void log_warning(const std::string &msg) { + if (show_warnings()) { + std::cerr << msg << std::flush; + } +} + std::string human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; @@ -64,7 +71,8 @@ std::string human_memory_size(size_t arg_bytes) { void Experimental::RawMemoryAllocationFailure::print_error_message( std::ostream &o) const { - o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << "Allocation of size " + << ::Kokkos::Impl::human_memory_size(m_attempted_size); o << " failed"; switch (m_failure_mode) { case FailureMode::OutOfMemoryError: diff --git a/core/src/impl/Kokkos_Error.hpp b/core/src/impl/Kokkos_Error.hpp index 97b171c477b..1058fd98dbf 100644 --- a/core/src/impl/Kokkos_Error.hpp +++ b/core/src/impl/Kokkos_Error.hpp @@ -28,6 +28,8 @@ namespace Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); +void log_warning(const std::string &msg); + std::string human_memory_size(size_t arg_bytes); } // namespace Impl diff --git a/core/unit_test/TestMDRangePolicyConstructors.hpp b/core/unit_test/TestMDRangePolicyConstructors.hpp index dfde75ce789..306f89413e0 100644 --- a/core/unit_test/TestMDRangePolicyConstructors.hpp +++ 
b/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include +#include + namespace { template @@ -99,15 +101,40 @@ TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - auto dim = (Policy::inner_direction == Kokkos::Iterate::Right) ? 1 : 0; + auto [dim0, dim1] = (Policy::inner_direction == Kokkos::Iterate::Right) + ? std::make_pair(1, 0) + : std::make_pair(0, 1); + std::string msg1 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim0) + ".\n"; - ASSERT_DEATH( - { - (void)Policy({100, 100}, {90, 90}); - }, - "Kokkos::MDRangePolicy bounds error: The lower bound \\(100\\) is " - "greater than its upper bound \\(90\\) in dimension " + - std::to_string(dim) + "\\."); + std::string msg2 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim1) + ".\n"; + +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + // escape the parentheses in the regex to match the error message + msg1 = std::regex_replace(msg1, std::regex("\\(|\\)"), "\\$&"); + (void)msg2; + ASSERT_DEATH({ (void)Policy({100, 100}, {90, 90}); }, msg1); +#else + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + ::testing::internal::CaptureStderr(); + (void)Policy({100, 100}, {90, 90}); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg1 + msg2); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg1; + (void)msg2; +#endif + +#endif } #endif diff --git a/core/unit_test/TestRangePolicyConstructors.hpp b/core/unit_test/TestRangePolicyConstructors.hpp index 6c8c69f2166..88656813ff4 100644 --- a/core/unit_test/TestRangePolicyConstructors.hpp +++ b/core/unit_test/TestRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include +#include + namespace { 
TEST(TEST_CATEGORY, range_policy_runtime_parameters) { @@ -74,13 +76,49 @@ TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) { using Policy = Kokkos::RangePolicy; using ChunkSize = Kokkos::ChunkSize; - ASSERT_DEATH({ (void)Policy(100, 90); }, - "Kokkos::RangePolicy bounds error: The lower bound \\(100\\) is " - "greater than the upper bound \\(90\\)\\."); + std::string msg = + "Kokkos::RangePolicy bounds error: The lower bound (100) is greater than " + "the upper bound (90).\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // escape the parentheses in the regex to match the error message + msg = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ASSERT_DEATH({ (void)Policy(100, 90); }, msg); ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); }, - "Kokkos::RangePolicy bounds error: The lower bound \\(100\\) is " - "greater than the upper bound \\(90\\)\\."); + msg); +#else + + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(100, 90); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + +#endif } } // namespace From bd3c0a552c423eb6f11f950cc68771ecc18b44b6 Mon Sep 17 00:00:00 2001 From: Simon Schlepphorst Date: Mon, 22 Jan 2024 09:24:40 +0100 Subject: [PATCH 229/432] Add C++26 standard to CMake Setup --- cmake/KokkosCore_config.h.in | 1 + 
cmake/kokkos_pick_cxx_std.cmake | 1 + cmake/kokkos_test_cxx_std.cmake | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index f54475e45aa..76549a31195 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -32,6 +32,7 @@ #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 #cmakedefine KOKKOS_ENABLE_CXX23 +#cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM diff --git a/cmake/kokkos_pick_cxx_std.cmake b/cmake/kokkos_pick_cxx_std.cmake index d4eca651d42..ae14a10d531 100644 --- a/cmake/kokkos_pick_cxx_std.cmake +++ b/cmake/kokkos_pick_cxx_std.cmake @@ -7,6 +7,7 @@ KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INST SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) SET(KOKKOS_ENABLE_CXX23 OFF) +SET(KOKKOS_ENABLE_CXX26 OFF) IF (KOKKOS_CXX_STANDARD) MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") ENDIF() diff --git a/cmake/kokkos_test_cxx_std.cmake b/cmake/kokkos_test_cxx_std.cmake index 7ad49fdd2d9..b075a3e36b5 100644 --- a/cmake/kokkos_test_cxx_std.cmake +++ b/cmake/kokkos_test_cxx_std.cmake @@ -74,6 +74,10 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") SET(KOKKOS_ENABLE_CXX23 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + kokkos_set_cxx_standard_feature(26) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + SET(KOKKOS_ENABLE_CXX26 ON) ELSE() MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") ENDIF() From 39a0f3d675f79841e2237de57c0c7efa702b663d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 22 Jan 2024 08:09:34 -0500 Subject: [PATCH 230/432] Add support for C++26 in generated makefiles --- Makefile.kokkos | 12 ++++++++++++ 1 file changed, 12 
insertions(+) diff --git a/Makefile.kokkos b/Makefile.kokkos index 506e3339bfc..5598f19da2f 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -63,6 +63,8 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) @@ -563,6 +565,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) From 87f32846bb0ac950ae7b639e922369b5754ffb60 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 22 Jan 2024 08:10:54 -0500 Subject: [PATCH 231/432] Add KOKKOS_ENABLE_CXX26 to the configuration metadata --- core/src/impl/Kokkos_Core.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 0d10819c7d7..bcb794b11fe 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp 
@@ -608,6 +608,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX23", "no"); #endif +#ifdef KOKKOS_ENABLE_CXX26 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "no"); +#endif #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", "yes"); From c4e1b86c888c0da33cd7129b2902bcc298cda10e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 19 Jan 2024 14:30:32 -0500 Subject: [PATCH 232/432] Reenable HIP testing --- .jenkins | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/.jenkins b/.jenkins index da21a7096ee..87053cbe833 100644 --- a/.jenkins +++ b/.jenkins @@ -141,9 +141,8 @@ pipeline { filename 'Dockerfile.hipcc' dir 'scripts/docker' additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'nvidia-docker && ampere' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' -// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + label 'rocm-docker ' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } environment { @@ -154,6 +153,7 @@ pipeline { } steps { sh 'ccache --zero-stats' + sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Debug \ @@ -161,7 +161,6 @@ pipeline { -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \ -DCMAKE_CXX_STANDARD=17 \ -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ARCH_AMD_GFX90A=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ 
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_TESTS=ON \ @@ -170,8 +169,7 @@ pipeline { -DKokkos_ENABLE_OPENMP=ON \ -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ .. && \ - make -j8''' -// make -j8 && ctest --verbose''' + make -j8 && ctest --verbose''' } post { always { @@ -185,9 +183,8 @@ pipeline { filename 'Dockerfile.hipcc' dir 'scripts/docker' additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' - label 'nvidia-docker && ampere' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' -// args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + label 'rocm-docker' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } steps { @@ -199,7 +196,6 @@ pipeline { -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ -DCMAKE_CXX_STANDARD=20 \ -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ARCH_AMD_GFX90A=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ @@ -207,8 +203,7 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ .. 
&& \ - make -j8''' -// make -j8 && ctest --verbose''' + make -j8 && ctest --verbose''' } post { always { From 523d70189dd7a04cb2d75dbe7e23cbb904fb578f Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 22 Jan 2024 09:20:21 -0500 Subject: [PATCH 233/432] Disabling failing HIP test in the CI --- core/unit_test/hip/TestHIP_Memory_Requirements.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index 8c72e9f2972..a213453ea18 100644 --- a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,6 +48,9 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host + // FIXME_HIP + GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " + "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); From 407e18dc8c652ff6d6c6c6796e2866891580ab4f Mon Sep 17 00:00:00 2001 From: Thomas Conrad Clevenger Date: Mon, 22 Jan 2024 07:42:09 -0700 Subject: [PATCH 234/432] Use team_size_max to fix "Team size too large" error in reducer test (#6725) * Fix test to not use team size too large For some configurations, one of the reducer test was setting a team size too large. * Remove team_size calculation from test not using it * Remove unused typedef * Use int in for-loop to avoid warning Co-authored-by: Damien L-G * Fix teamsize request for OPENMPTARGET * Cleanup test and match TestTeam.hpp Use 32 as a minimum team size like in TestTeam.hpp. Better match layout in the 3 tests here. 
--------- Co-authored-by: Damien L-G --- core/unit_test/TestReducers.hpp | 119 ++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 36 deletions(-) diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index f710c40d3a3..fbcb9629af0 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -56,6 +56,28 @@ struct TestReducers { } }; + struct TeamSumNestedFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + SumFunctor f; + int M, N; + Kokkos::View result; + + TeamSumNestedFunctor(SumFunctor& f_, const int M_, const int N_, + Kokkos::View result_) + : f(f_), M(M_), N(N_), result(result_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m) const { + const int i = m.league_rank(); + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(m, N), f, reducer_scalar); + result(i) = local_scalar; + } + }; + struct ProdFunctor { Kokkos::View values; @@ -351,53 +373,78 @@ struct TestReducers { } #endif - using member_type = typename Kokkos::TeamPolicy::member_type; - Scalar sum_scalar; Kokkos::View sum_view("result"); Kokkos::deep_copy(sum_view, Scalar(1)); - constexpr int num_teams = get_num_teams(); - TeamSumFunctor tf; + // Test team policy reduction + { + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 #ifdef KOKKOS_ENABLE_OPENMPTARGET - auto team_pol = Kokkos::TeamPolicy(num_teams, Kokkos::AUTO); + int team_size = + std::is_same::value + ? 
32 + : 1; #else - auto team_pol = Kokkos::TeamPolicy(num_teams, 1); + int team_size = 1; #endif - Kokkos::parallel_reduce(team_pol, tf, sum_view); - Kokkos::deep_copy(sum_scalar, sum_view); - ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + auto team_pol = Kokkos::TeamPolicy(num_teams, team_size); + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + } - Kokkos::parallel_for( + // Test TeamThreadRange level reduction with 0 work produces 0 result + { + const int league_size = 1; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, 0, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 #ifdef KOKKOS_ENABLE_OPENMPTARGET - Kokkos::TeamPolicy(1, Kokkos::AUTO), + int team_size = + std::is_same::value + ? 32 + : 1; #else - Kokkos::TeamPolicy(1, 1), + int team_size = 1; #endif - KOKKOS_LAMBDA(member_type team_member) { - Scalar local_scalar; - Kokkos::Sum reducer_scalar( - local_scalar); - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, 0), f, - reducer_scalar); - sum_view() = local_scalar; - }); - Kokkos::deep_copy(sum_scalar, sum_view); - ASSERT_EQ(sum_scalar, Scalar{0}) << "N: " << N; - - auto team_size = std::min(128, TEST_EXECSPACE().concurrency()); - Kokkos::parallel_for( - Kokkos::TeamPolicy(10, team_size), - KOKKOS_LAMBDA(member_type team_member) { - Scalar local_scalar; - Kokkos::Sum reducer_scalar( - local_scalar); - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, N), f, - reducer_scalar); - sum_view() = local_scalar; - }); - Kokkos::deep_copy(sum_scalar, sum_view); - ASSERT_EQ(sum_scalar, reference_sum) << "N: " << N; + auto team_pol = Kokkos::TeamPolicy(1, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + ASSERT_EQ(result_h(0), 
Scalar{0}) << "N: " << N; + } + + // Same test as above, but with inner reduction over N, and league_size=10 + { + const int league_size = 10; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, N, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int initial_team_size = + std::is_same_v ? 32 + : 1; +#else + int initial_team_size = 1; +#endif + auto team_size_max = + Kokkos::TeamPolicy(league_size, initial_team_size) + .team_size_max(tnf, Kokkos::ParallelForTag()); + auto team_size = std::min(team_size_max, TEST_EXECSPACE().concurrency()); + auto team_pol = Kokkos::TeamPolicy(league_size, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + for (int i = 0; i < result_h.extent_int(0); ++i) { + ASSERT_EQ(result_h(i), reference_sum) << "N: " << N; + } + } } static void test_sum(int N) { From 5610068c5d908e94806ce8e9b7a8118ec959b1eb Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 22 Jan 2024 15:05:58 -0500 Subject: [PATCH 235/432] Don't touch my records! 
(refactor Cuda/HIP/SYCL/Threads to not directly mess with `SharedAllocationRecord`) (#6732) * Do not use SharedAllocationRecord directly * Purge Cuda/HIP graph implementation from SharedAllocationRecord * MemorySpace::{free -> allocate} and FIXMEs for size argument * Properly cast allocated pointer from void* to size_type* * Fixup Cuda/HIP graph header includes and pointer casting * Fix the FIXMEs --- core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 20 +--- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 99 +++++++++---------- core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 14 +-- core/src/HIP/Kokkos_HIP_Instance.cpp | 74 +++++++------- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 83 +++++++--------- core/src/Threads/Kokkos_Threads_Instance.cpp | 25 +---- 6 files changed, 124 insertions(+), 191 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index a4d064e544a..5a821ab64a3 100644 --- a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -23,8 +23,7 @@ #include -#include // GraphAccess needs to be complete -#include // SharedAllocationRecord +#include // GraphAccess needs to be complete #include #include @@ -50,10 +49,6 @@ class GraphNodeKernelImpl m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - // Note: owned pointer to CudaSpace memory (used for global memory launches), - // which we're responsible for deallocating, but not responsible for calling - // its destructor. - using Record = Kokkos::Impl::SharedAllocationRecord; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... 
mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; @@ -82,9 +77,7 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - - auto* record = Record::allocate( - Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr) return m_driver_storage; } diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 0e34ccc67c9..6c60532705a 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -336,22 +336,19 @@ void CudaInternal::initialize(cudaStream_t stream) { Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = - Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream), - "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); @@ -363,22 +360,19 @@ Cuda::size_type 
*CudaInternal::scratch_flags(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = - Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream), - "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -387,23 +381,20 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { if (verify_is_initialized("scratch_unified") && m_scratchUnifiedCount < scratch_count(size)) { - m_scratchUnifiedCount = scratch_count(size); + auto mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchUnified) { + mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + } - if (m_scratchUnified) - Record::decrement(Record::get_record(m_scratchUnified)); + m_scratchUnifiedCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); - Record *const r = - Record::allocate(CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream), - "Kokkos::InternalScratchUnified", alloc_size); - - 
Record::increment(r); - - m_scratchUnified = reinterpret_cast(r->data()); + m_scratchUnified = static_cast( + mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size)); } return m_scratchUnified; @@ -411,21 +402,16 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = - Kokkos::Impl::SharedAllocationRecord; + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); + if (m_scratchFunctor) { + mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } - Record *const r = Record::allocate( - CudaSpace::impl_create(m_cudaDev, m_stream), - "Kokkos::InternalScratchFunctor", m_scratchFunctorSize); - - Record::increment(r); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctor = static_cast(mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); } return m_scratchFunctor; @@ -480,15 +466,18 @@ void CudaInternal::finalize() { was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordCuda = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; - - RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); - RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); - RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); - if (m_scratchFunctorSize > 0) - RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * 
sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } for (int i = 0; i < m_n_team_scratch; ++i) { diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 576c53426bc..434c62afc5f 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -26,7 +26,6 @@ #include #include -#include #include namespace Kokkos { @@ -43,7 +42,6 @@ class GraphNodeKernelImpl using base_t = typename PatternImplSpecializationFromTag::type; - using Record = Kokkos::Impl::SharedAllocationRecord; // TODO use the name and executionspace template @@ -60,7 +58,7 @@ class GraphNodeKernelImpl ~GraphNodeKernelImpl() { if (m_driver_storage) { - Record::decrement(Record::get_record(m_driver_storage)); + Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); } } @@ -78,15 +76,9 @@ class GraphNodeKernelImpl Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - - auto* record = Record::allocate( - Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; } diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index d441c59c212..3b5a1e0017c 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -191,20 +191,19 @@ void HIPInternal::initialize(hipStream_t stream) { Kokkos::HIP::size_type 
*HIPInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -213,20 +212,19 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } @@ -237,29 +235,20 @@ 
Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( void const *driver, std::size_t const size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; + Kokkos::HIPHostPinnedSpace host_mem_space; if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - RecordHost::increment(r_host); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); - m_scratchFunctorHost = reinterpret_cast(r_host->data()); + m_scratchFunctor = static_cast(device_mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); + m_scratchFunctorHost = static_cast(host_mem_space.allocate( + "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize)); } // When using HSA_XNACK=1, it is necessary to copy the driver to the host to @@ -323,14 +312,17 @@ void HIPInternal::finalize() { was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordHIP = Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; - RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchFlags, + 
m_scratchSpaceCount * sizeScratchGrain); + device_mem_space.deallocate(m_scratchSpace, + m_scratchFlagsCount * sizeScratchGrain); if (m_scratchFunctorSize > 0) { - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + Kokkos::HIPHostPinnedSpace host_mem_space; + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } } diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 3f931c016ff..05b50d52534 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -196,13 +196,17 @@ void SYCLInternal::finalize() { #endif } - using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; + auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); + auto host_mem_space = SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); if (nullptr != m_scratchHost) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchHost)); + host_mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); if (nullptr != m_scratchFlags) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); m_syclDev = -1; m_scratchSpaceCount = 0; m_scratchSpace = nullptr; @@ -232,23 +236,18 @@ void SYCLInternal::finalize() { sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if 
(nullptr != m_scratchSpace) - Record::decrement(Record::get_record(m_scratchSpace)); + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -257,22 +256,18 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - m_scratchHostCount = scratch_count(size); + auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>; + if (nullptr != m_scratchHost) + mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); - if (m_scratchHost) Record::decrement(Record::get_record(m_scratchHost)); + m_scratchHostCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchHostCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLHostUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size); - - Record::increment(r); - - m_scratchHost = reinterpret_cast(r->data()); + m_scratchHost = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); } return m_scratchHost; @@ -281,23 +276,18 @@ sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { sycl::device_ptr 
SYCLInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) - Record::decrement(Record::get_record(m_scratchFlags)); + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); } auto memset_event = m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain); @@ -346,15 +336,12 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { assert(m_q); if (m_capacity < n) { - using Record = Kokkos::Impl::SharedAllocationRecord; - // First free what we have (in case malloc can reuse it) - if (m_data) Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + if (m_data) alloc_space.deallocate(m_data, m_capacity); - Record* const r = Record::allocate( - AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n); - Record::increment(r); + m_data = + alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); - m_data = r->data(); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); m_capacity = n; @@ -368,8 +355,8 @@ void SYCLInternal::USMObjectMem::reset() { if (m_data) { // This implies a fence since this class is not copyable // and 
deallocating implies a fence across all registered queues. - using Record = Kokkos::Impl::SharedAllocationRecord; - Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + alloc_space.deallocate(m_data, m_capacity); m_capacity = 0; m_data = nullptr; diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 9e7b4222aa3..49408b89916 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -172,14 +172,9 @@ ThreadsInternal::ThreadsInternal() ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); - using Record = Kokkos::Impl::SharedAllocationRecord; - if (m_scratch) { - Record *const r = Record::get_record(m_scratch); - + Kokkos::kokkos_free(m_scratch); m_scratch = nullptr; - - Record::decrement(r); } m_pool_base = nullptr; @@ -315,11 +310,8 @@ void ThreadsInternal::execute_resize_scratch_in_serial() { auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = Record::get_record(exec.m_scratch); - exec.m_scratch = nullptr; - Record::decrement(r); + Kokkos::kokkos_free(exec.m_scratch); + exec.m_scratch = nullptr; } }; if (s_threads_process.m_pool_base) { @@ -370,15 +362,8 @@ void ThreadsInternal::first_touch_allocate_thread_private_scratch( if (s_threads_process.m_scratch_thread_end) { // Allocate tracked memory: { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = - Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", - s_threads_process.m_scratch_thread_end); - - Record::increment(r); - - exec.m_scratch = r->data(); + exec.m_scratch = Kokkos::kokkos_malloc( + "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast(exec.m_scratch); From d18ad8f349634e23fe78db185114ebdc1e9674fb Mon Sep 17 00:00:00 2001 From: Damien 
L-G Date: Mon, 22 Jan 2024 23:06:27 -0500 Subject: [PATCH 236/432] Untangle SharedAllocationRecord spaghetti code --- Makefile.targets | 2 - core/src/Cuda/Kokkos_CudaSpace.cpp | 174 +------------- core/src/Cuda/Kokkos_CudaSpace.hpp | 178 +------------- core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp | 1 - core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 1 - .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 144 +----------- .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 119 +--------- core/src/HIP/Kokkos_HIP_Space.cpp | 21 -- core/src/HIP/Kokkos_HIP_Space.hpp | 2 - core/src/HIP/Kokkos_HIP_UniqueToken.hpp | 1 - core/src/Kokkos_HBWSpace.hpp | 73 +----- core/src/Kokkos_HostSpace.hpp | 72 +----- core/src/OpenACC/Kokkos_OpenACCSpace.cpp | 1 - .../Kokkos_OpenACC_SharedAllocationRecord.cpp | 89 +------ .../Kokkos_OpenACC_SharedAllocationRecord.hpp | 52 +--- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 71 +----- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 63 +---- core/src/SYCL/Kokkos_SYCL_Space.cpp | 222 +----------------- core/src/SYCL/Kokkos_SYCL_Space.hpp | 151 +----------- core/src/impl/Kokkos_HBWSpace.cpp | 135 +---------- core/src/impl/Kokkos_HostSpace.cpp | 81 +------ core/src/impl/Kokkos_MemorySpace.cpp | 72 ------ core/src/impl/Kokkos_MemorySpace.hpp | 71 ------ core/src/impl/Kokkos_SharedAlloc.cpp | 50 ++++ core/src/impl/Kokkos_SharedAlloc.hpp | 164 ++++++++++++- core/src/impl/Kokkos_SharedAlloc_timpl.hpp | 131 +++++++++-- 26 files changed, 373 insertions(+), 1768 deletions(-) delete mode 100644 core/src/impl/Kokkos_MemorySpace.cpp delete mode 100644 core/src/impl/Kokkos_MemorySpace.hpp diff --git a/Makefile.targets b/Makefile.targets index 75155bdd25f..6db0f2c17cc 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -28,8 +28,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index 9cd074df4fd..9eccc9a7245 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -33,7 +33,6 @@ //#include #include -#include #include @@ -437,160 +436,6 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -//============================================================================== -// {{{1 - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -void SharedAllocationRecord::deep_copy_header_no_exec( - void *ptr, const void *header) { - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy(exec, ptr, header, - sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - 
-SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -// end SharedAllocationRecord destructors }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , 
user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -// end SharedAllocationRecord 
constructors }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -619,19 +464,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class SharedAllocationRecordCommon; -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaHostPinnedSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index 4a220dd6450..b0e36f1a875 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -123,7 +123,6 @@ class CudaSpace { cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -539,179 +538,10 @@ struct DeepCopy -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecord; - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - - using RecordBase = 
SharedAllocationRecord; - using base_t = - HostInaccessibleSharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::CudaSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - // workaround for issue with NVCC and MSVC - // https://github.com/kokkos/kokkos/issues/5258 - deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header); - } - - SharedAllocationRecord( - const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::CudaSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - // helper function to work around MSVC+NVCC issue - // 
https://github.com/kokkos/kokkos/issues/5258 - static void deep_copy_header_no_exec(void*, const void*); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaUVMSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = SharedAllocationRecordCommon; - - SharedAllocationRecord(const 
SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index abb747e39a1..94a428493f4 100644 --- a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ 
b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -22,7 +22,6 @@ #include #include -#include namespace Kokkos { diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 434c62afc5f..5f0df72df17 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ea599989e7a..ab24004f5fc 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -18,138 +18,14 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include -#include #include - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef 
KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - HIP exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIP& arg_exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - 
sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via host pinned memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via managed memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos +#include +#include +#include + +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPManagedSpace); diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index e68bad97230..fbae5188344 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -18,120 +18,11 @@ #define KOKKOS_HIP_SHARED_ALLOCATION_RECORD_HPP #include +#include -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - friend class 
HostInaccessibleSharedAllocationRecordCommon; - using base_t = HostInaccessibleSharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec*/, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIP& exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : 
SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPManagedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPManagedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace); #endif diff --git a/core/src/HIP/Kokkos_HIP_Space.cpp b/core/src/HIP/Kokkos_HIP_Space.cpp index 7f6aa0d8e82..e8bdfca66fe 100644 --- a/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/core/src/HIP/Kokkos_HIP_Space.cpp @@ -24,10 +24,8 @@ #include #include -#include #include -#include #include #include @@ -287,22 +285,3 @@ void 
HIPManagedSpace::impl_deallocate( } } // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include - -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index e7dc42a431f..28e5a1ccd50 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -92,8 +92,6 @@ class HIPSpace { private: int m_device; ///< Which HIP device - - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> diff --git a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 313e5f52172..3d70b596463 100644 --- a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -19,7 +19,6 @@ #include #include -#include namespace Kokkos { diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp index f7775ba2964..4400bb77606 100644 --- a/core/src/Kokkos_HBWSpace.hpp +++ b/core/src/Kokkos_HBWSpace.hpp @@ -26,6 +26,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_HBWSPACE #include +#include namespace Kokkos { @@ -99,8 +100,6 @@ class HBWSpace { private: AllocationMechanism m_alloc_mech; - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::HBWSpace, void>; }; } // namespace Experimental @@ -109,75 +108,7 @@ class HBWSpace { 
//---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecord { - private: - friend Kokkos::Experimental::HBWSpace; - - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase*); - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HBWSpace instance */ - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::HBWSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - 
static void print_records(std::ostream&, - const Kokkos::Experimental::HBWSpace&, - bool detail = false); -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::Experimental::HBWSpace); //---------------------------------------------------------------------------- diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 7c84d8afab1..82adb29f2fc 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -37,7 +37,6 @@ static_assert(false, #include #include "impl/Kokkos_HostSpace_deepcopy.hpp" -#include /*--------------------------------------------------------------------------*/ @@ -121,7 +120,6 @@ class HostSpace { private: static constexpr const char* m_name = "Host"; - friend class Kokkos::Impl::SharedAllocationRecord; }; } // namespace Kokkos @@ -169,75 +167,7 @@ struct HostMirror { //---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend Kokkos::HostSpace; - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HostSpace instance */ - static RecordBase s_root_record; -#endif - - Kokkos::HostSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template - SharedAllocationRecord( - const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HostSpace); //---------------------------------------------------------------------------- diff --git a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 372d8069c99..acc0dcd3c6e 100644 --- a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp index 91faa64f733..76e1514476a 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp @@ -16,92 +16,11 @@ #define 
KOKKOS_IMPL_PUBLIC_INCLUDE -#include +#include #include -#include -#include - -#ifdef KOKKOS_ENABLE_DEBUG -Kokkos::Impl::SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenACCSpace, void>::s_root_record; -#endif - -Kokkos::Impl::SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -Kokkos::Impl::SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -Kokkos::Impl::SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC &arg_exec_space, - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, 
arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -//============================================================================== -// {{{1 +#include #include -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicitly instantiate these CRTP base classes -// here, where we have access to the associated *_timpl.hpp header files. -template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; -template class Kokkos::Impl::SharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenACCSpace); diff --git a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp index cf83a5b27bc..cde5ecdcb77 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp @@ -20,55 +20,7 @@ #include #include -#include - -template <> -class Kokkos::Impl::SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - friend class SharedAllocationRecordCommon; - friend Kokkos::Experimental::OpenACCSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - using RecordBase = 
SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenACCSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenACCSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC& exec_space, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); -}; +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenACCSpace); #endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 81fbc56de00..a414b34d7c6 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -37,7 +37,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -110,79 +109,13 @@ void OpenMPTargetSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos 
-//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // TODO DeepCopy - // DeepCopy - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) 
not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenMPTargetSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index e5b33d0982f..98ff7b18d0e 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -121,9 +121,6 @@ class OpenMPTargetSpace { const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>; }; } // namespace Experimental } // namespace Kokkos @@ -131,64 +128,8 @@ class OpenMPTargetSpace { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend Kokkos::Experimental::OpenMPTargetSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - 
Kokkos::Experimental::OpenMPTargetSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenMPTargetSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenMPTargetSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc) { - KOKKOS_IF_ON_HOST( - (return new SharedAllocationRecord(arg_space, arg_label, arg_alloc);)) - KOKKOS_IF_ON_DEVICE( - ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;)) - } -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenMPTargetSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index 64b7f56796a..9cc8008cdf3 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -25,7 +25,6 @@ #include 
#include #include -#include #include /*--------------------------------------------------------------------------*/ @@ -243,226 +242,17 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record; -#endif - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(space, label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Experimental::SYCL exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& arg_exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( 
-#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space, - label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - 
SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { 
- const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index df921df5801..f7b801f8463 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -292,151 +292,14 @@ struct MemorySpaceAccess< } // namespace Impl -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::SYCLDeviceUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class 
SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLSharedUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; 
- SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLHostUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl - } // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLHostUSMSpace); + #endif #endif diff --git a/core/src/impl/Kokkos_HBWSpace.cpp b/core/src/impl/Kokkos_HBWSpace.cpp index cd640b88cb9..947bef1253b 100644 --- a/core/src/impl/Kokkos_HBWSpace.cpp +++ b/core/src/impl/Kokkos_HBWSpace.cpp @@ -32,7 +32,6 @@ #include #include -#include #include #ifdef KOKKOS_ENABLE_HBWSPACE #include @@ -177,137 +176,9 @@ void HBWSpace::impl_deallocate( 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - -//---------------------------------------------------------------------------- - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - 
allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); +#include - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = - SharedAllocationRecord; - - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - RecordHost *const record = - head ? static_cast(head->m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -// Iterate records to print orphaned memory ... 
-void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "HBWSpace", &s_root_record, detail); -#else - throw_runtime_exception( - "SharedAllocationRecord::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::HBWSpace); #endif diff --git a/core/src/impl/Kokkos_HostSpace.cpp b/core/src/impl/Kokkos_HostSpace.cpp index a9d72160593..6064a595f9c 100644 --- a/core/src/impl/Kokkos_HostSpace.cpp +++ b/core/src/impl/Kokkos_HostSpace.cpp @@ -21,7 +21,6 @@ #include #include -#include #include /*--------------------------------------------------------------------------*/ @@ -150,84 +149,6 @@ void HostSpace::impl_deallocate( } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast( - space.allocate(alloc_size)); - } catch (Experimental::RawMemoryAllocationFailure const &failure) { - if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: - FailureMode::AllocationNotAligned) { - // TODO: delete the misaligned memory - } - - std::cerr << "Kokkos failed to allocate memory for label \"" << 
label - << "\". Allocation using MemorySpace named \"" << space.name() - << " failed with the following error: "; - failure.print_error_message(std::cerr); - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); - } - return nullptr; // unreachable -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::HostSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//============================================================================== -// {{{1 - #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HostSpace); diff --git a/core/src/impl/Kokkos_MemorySpace.cpp b/core/src/impl/Kokkos_MemorySpace.cpp deleted file mode 100644 index 2f0e01c5b28..00000000000 --- a/core/src/impl/Kokkos_MemorySpace.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.cpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? 
- o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... - } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - -} // end namespace Impl -} // end namespace Kokkos diff --git a/core/src/impl/Kokkos_MemorySpace.hpp b/core/src/impl/Kokkos_MemorySpace.hpp deleted file mode 100644 index 44956dd7c5d..00000000000 --- a/core/src/impl/Kokkos_MemorySpace.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.hpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. 
- */ - -#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP -#define KOKKOS_IMPL_MEMORYSPACE_HPP - -#include -#include -#include - -#include - -namespace Kokkos { -namespace Impl { - -// Defined in implementation file to avoid having to include iostream -void safe_throw_allocation_with_header_failure( - std::string const &space_name, std::string const &label, - Kokkos::Experimental::RawMemoryAllocationFailure const &failure); - -template -SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -template -SharedAllocationHeader *checked_allocation_with_header( - ExecutionSpace const &exec_space, MemorySpace const &space, - std::string const &label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_MEMORYSPACE_HPP diff --git a/core/src/impl/Kokkos_SharedAlloc.cpp b/core/src/impl/Kokkos_SharedAlloc.cpp index 255f5125f4a..0bc3814b3a1 100644 --- a/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/core/src/impl/Kokkos_SharedAlloc.cpp @@ -20,6 +20,8 @@ #include #include +#include +#include namespace Kokkos { namespace Impl { @@ -321,5 +323,53 @@ void SharedAllocationRecord::print_host_accessible_records( } #endif +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + 
Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + auto generate_failure_message = [&](std::ostream& o) { + o << "Kokkos failed to allocate memory for label \"" << label + << "\". Allocation using MemorySpace named \"" << space_name + << "\" failed with the following error: "; + failure.print_error_message(o); + if (failure.failure_mode() == + Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: + AllocationNotAligned) { + // TODO: delete the misaligned memory? + o << "Warning: Allocation failed due to misalignment; memory may " + "be leaked.\n"; + } + o.flush(); + }; + try { + std::ostringstream sstr; + generate_failure_message(sstr); + Kokkos::Impl::throw_runtime_exception(sstr.str()); + } catch (std::bad_alloc const&) { + // Probably failed to allocate the string because we're so close to out + // of memory. Try printing to std::cerr instead + try { + generate_failure_message(std::cerr); + } catch (std::bad_alloc const&) { + // oh well, we tried... + } + Kokkos::Impl::throw_runtime_exception( + "Kokkos encountered an allocation failure, then another allocation " + "failure while trying to create the error message."); + } +} + +void fill_host_accessible_header_info( + SharedAllocationRecord* arg_record, + SharedAllocationHeader& arg_header, std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = arg_record; + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + } /* namespace Impl */ } /* namespace Kokkos */ diff --git a/core/src/impl/Kokkos_SharedAlloc.hpp b/core/src/impl/Kokkos_SharedAlloc.hpp index 043505a158e..9a9c2653de3 100644 --- a/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/core/src/impl/Kokkos_SharedAlloc.hpp @@ -51,6 +51,9 @@ class SharedAllocationHeader { friend class 
SharedAllocationRecordCommon; template friend class HostInaccessibleSharedAllocationRecordCommon; + friend void fill_host_accessible_header_info( + SharedAllocationRecord*, SharedAllocationHeader&, + std::string const&); Record* m_record; char m_label[maximum_label_length]; @@ -145,25 +148,23 @@ class SharedAllocationRecord { SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), - m_dealloc(nullptr) + m_dealloc(nullptr), #ifdef KOKKOS_ENABLE_DEBUG - , m_root(this), m_prev(this), - m_next(this) + m_next(this), #endif - , m_count(0) { } static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION const SharedAllocationHeader* head() const { return m_alloc_ptr; } /* User's memory begins at the end of the header */ - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION void* data() const { return static_cast(m_alloc_ptr + 1); } /* User's memory begins at the end of the header */ @@ -195,23 +196,79 @@ class SharedAllocationRecord { const SharedAllocationRecord* const root, const bool detail); }; +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure); + +template +SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, + std::string const& label, + size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +template +SharedAllocationHeader* checked_allocation_with_header( + ExecutionSpace const& exec_space, MemorySpace const& space, + std::string const& label, size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + exec_space, label.c_str(), alloc_size + 
sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label); + template class SharedAllocationRecordCommon : public SharedAllocationRecord { private: using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; - derived_t& self() { return *static_cast(this); } - derived_t const& self() const { return *static_cast(this); } protected: using record_base_t::record_base_t; - void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label); + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif static void deallocate(record_base_t* arg_rec); public: + ~SharedAllocationRecordCommon(); + template + SharedAllocationRecordCommon( + ExecutionSpace const&, MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); + } + SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + static auto allocate(MemorySpace const& arg_space, std::string const& arg_label, size_t arg_alloc_size) -> derived_t*; @@ -231,22 +288,103 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { template class HostInaccessibleSharedAllocationRecordCommon - : public 
SharedAllocationRecordCommon { + : public SharedAllocationRecord { private: - using base_t = SharedAllocationRecordCommon; using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; protected: - using base_t::base_t; + using record_base_t::record_base_t; + + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif + + static void deallocate(record_base_t* arg_rec); public: + ~HostInaccessibleSharedAllocationRecordCommon(); + template + HostInaccessibleSharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + } + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + + static auto allocate(MemorySpace const& arg_space, + std::string const& arg_label, size_t arg_alloc_size) + -> derived_t*; + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(MemorySpace const& arg_space, + std::string const& arg_alloc_label, + size_t arg_alloc_size); + /**\brief Deallocate tracked memory in the space */ + static void deallocate_tracked(void* arg_alloc_ptr); + /**\brief Reallocate tracked memory in the space */ + static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); + static void print_records(std::ostream& s, MemorySpace const&, bool detail = false); static auto get_record(void* 
alloc_ptr) -> derived_t*; std::string get_label() const; }; +#ifdef KOKKOS_ENABLE_DEBUG +template +SharedAllocationRecord + SharedAllocationRecordCommon::s_root_record; + +template +SharedAllocationRecord + HostInaccessibleSharedAllocationRecordCommon::s_root_record; +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::SharedAllocationRecordCommon { \ + using SharedAllocationRecordCommon< \ + MEMORY_SPACE>::SharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( \ + MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> { \ + using HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE>::HostInaccessibleSharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::SharedAllocationRecordCommon + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> + namespace { /* Taking the address of this function so make sure it is unique */ diff --git a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index d403ef9db06..c903180dea5 100644 --- a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -31,6 +31,66 @@ namespace Kokkos { namespace Impl { +template +SharedAllocationRecordCommon::~SharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - 
sizeof(SharedAllocationHeader)); +} +template +HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::~HostInaccessibleSharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} + +template +SharedAllocationRecordCommon::SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); +} + +template +HostInaccessibleSharedAllocationRecordCommon:: + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + typename MemorySpace::execution_space exec; + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + exec.fence(std::string("SharedAllocationRecord::SharedAllocationRecord(): " + "fence after copying header from HostSpace"); +} + template auto SharedAllocationRecordCommon::allocate( MemorySpace const& arg_space, std::string const& arg_label, @@ -86,6 +146,63 @@ void* 
SharedAllocationRecordCommon::reallocate_tracked( return r_new->data(); } +template +auto HostInaccessibleSharedAllocationRecordCommon::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::allocate_tracked(const MemorySpace& arg_space, + const std::string& arg_alloc_label, + size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template +void HostInaccessibleSharedAllocationRecordCommon::deallocate( + HostInaccessibleSharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast(arg_rec); +} + +template +void HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::deallocate_tracked(void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::reallocate_tracked(void* arg_alloc_ptr, + size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + Kokkos::fence( + "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + template auto SharedAllocationRecordCommon::get_record(void* alloc_ptr) -> derived_t* { @@ -108,20 +225,6 @@ std::string SharedAllocationRecordCommon::get_label() const { return record_base_t::m_label; } -template -void 
SharedAllocationRecordCommon:: - _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label) { - // Fill in the Header information, directly accessible on the host - - arg_header.m_record = &self(); - - strncpy(arg_header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - template void SharedAllocationRecordCommon::print_records( std::ostream& s, const MemorySpace&, bool detail) { From c17969f3338f9b5866496e31f34796703038d85a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 24 Jan 2024 09:48:29 -0500 Subject: [PATCH 237/432] Trilinos: Don't let Kokkos set CMAKE_CXX_FLAGS --- CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6d3ab4e29d..3b2168ff9ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,7 +252,6 @@ ENDIF() # subpackages ## This restores the old behavior of ProjectCompilerPostConfig.cmake -# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos # We must do this before KOKKOS_PACKAGE_DECL IF (KOKKOS_HAS_TRILINOS) # Overwrite the old flags at the top-level @@ -280,21 +279,13 @@ IF (KOKKOS_HAS_TRILINOS) SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}") IF (KOKKOS_ENABLE_CUDA) STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}") 
ENDIF() - # Both parent scope and this package - # In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in - # TRILINOS_TOPLEVEL_CXX_FLAGS - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE) - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}") - #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here #These flags get set up in KOKKOS_PACKAGE_DECL, which means they #must be configured before KOKKOS_PACKAGE_DECL SET(KOKKOS_ALL_COMPILE_OPTIONS From a1a6ea14c181b0e4f88cec82652bf594ceba5e22 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 24 Jan 2024 08:14:54 -0700 Subject: [PATCH 238/432] Fix TestThreadVectorMDRangeParallelReduce (#6734) * Fix TestThreadVectorMDRangeParallelReduce * Eliminate teamSum --- core/unit_test/TestTeamMDRange.hpp | 51 ++++++++++++------------------ 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/core/unit_test/TestTeamMDRange.hpp b/core/unit_test/TestTeamMDRange.hpp index 7948dd8b1a5..81931467c5a 100644 --- a/core/unit_test/TestTeamMDRange.hpp +++ b/core/unit_test/TestTeamMDRange.hpp @@ -1416,15 +1416,14 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { #endif ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1432,11 +1431,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k); }, threadSum); - - teamSum += threadSum; + 
Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1479,15 +1476,14 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { #endif ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1497,10 +1493,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1544,15 +1539,14 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { #endif ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1562,10 +1556,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1614,15 +1607,14 @@ struct TestThreadVectorMDRangeParallelReduce : public 
TestTeamMDParallelReduce { #endif ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1632,10 +1624,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1688,15 +1679,14 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { #endif ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5, n6); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1706,10 +1696,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); From 2dc7cbcc9cebe83569a644ccf3a7c6e825e6ec4a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 24 Jan 2024 14:45:18 -0700 Subject: [PATCH 239/432] Cuda multi-GPU support: Allow execution space instance constructor to run (#6706) * Cuda multi-GPU support: 
Allow execution space instance constructor to run * Skip a test * Use cuda_stream/device also for UVM and HostPinned * Clean up * Revert test changes --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 7 +-- core/unit_test/CMakeLists.txt | 6 +++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 49 +++++++++++++++++++ 3 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 6c60532705a..1d61c9c5906 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -292,11 +292,7 @@ void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); - // FIXME_CUDA multiple devices - if (m_cudaDev != Cuda().cuda_device()) - Kokkos::abort( - "Currently, the device id must match the device id used when Kokkos " - "was initialized!"); + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -317,7 +313,6 @@ void CudaInternal::initialize(cudaStream_t stream) { (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - m_stream = stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 7bbf72c2533..45e002dd63f 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -782,6 +782,12 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_CudaInterOpStreamsMultiGPU + SOURCES + UnitTestMain.cpp + cuda/TestCuda_InterOp_StreamsMultiGPU.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp 
b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..8a8270c7f93 --- /dev/null +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,49 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { +TEST(cuda, multi_gpu) { + Kokkos::initialize(); + + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(0)); + cudaStream_t stream0; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream0)); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(n_devices - 1)); + cudaStream_t stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); + + { + TEST_EXECSPACE space0(stream0); + ASSERT_EQ(space0.cuda_device(), 0); + TEST_EXECSPACE space(stream); + ASSERT_EQ(space.cuda_device(), n_devices - 1); + } + Kokkos::finalize(); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream0)); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(n_devices - 1)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream)); +} +} // namespace From 57126af31360a14cf1b5e2948a3f5fbf48a861b2 Mon Sep 17 00:00:00 2001 From: tcclevenger Date: Wed, 24 Jan 2024 16:16:01 -0700 Subject: [PATCH 240/432] add more warp sync for cuda reductions --- core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 5 +++++ core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git 
a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index d1b0dbb815f..fa804556d6e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -308,6 +308,11 @@ class ParallelReduce Date: Thu, 25 Jan 2024 10:20:04 -0500 Subject: [PATCH 241/432] Drop support for deprecated command-line arguments and environment variables (#6744) * Drop support for deprecated cmdline args and env vars * Deprecate num_devices and skip_device in InitializationSettings * Fixup rm now unused get_flag utility lambda --- core/src/impl/Kokkos_Core.cpp | 160 +----------------- core/src/impl/Kokkos_DeviceManagement.hpp | 3 +- .../impl/Kokkos_InitializationSettings.hpp | 24 ++- core/unit_test/TestInitializationSettings.cpp | 4 - .../TestParseCmdLineArgsAndEnvVars.cpp | 66 ++------ 5 files changed, 49 insertions(+), 208 deletions(-) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index bcb794b11fe..25217786471 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -90,8 +90,6 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(num_threads); KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); - KOKKOS_IMPL_COMBINE_SETTING(num_devices); - KOKKOS_IMPL_COMBINE_SETTING(skip_device); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); @@ -317,8 +315,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { return std::stoi(id.c_str()); } -std::vector Kokkos::Impl::get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count) { +std::vector Kokkos::Impl::get_visible_devices(int device_count) { std::vector visible_devices; char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES"); if (env_visible_devices) { @@ -345,30 +342,9 @@ std::vector Kokkos::Impl::get_visible_devices( } 
} } else { - int num_devices = - settings.has_num_devices() ? settings.get_num_devices() : device_count; - if (num_devices > device_count) { - std::stringstream ss; - ss << "Error: Specified number of devices '" << num_devices - << "' exceeds the actual number of GPUs available for execution '" - << device_count << "'." - << " Raised by Kokkos::initialize().\n"; - Kokkos::abort(ss.str().c_str()); - } - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < device_count; ++i) { visible_devices.push_back(i); } - if (settings.has_skip_device()) { - if (visible_devices.size() == 1 && settings.get_skip_device() == 0) { - Kokkos::abort( - "Error: skipping the only GPU available for execution.\n" - " Raised by Kokkos::initialize().\n"); - } - visible_devices.erase( - std::remove(visible_devices.begin(), visible_devices.end(), - settings.get_skip_device()), - visible_devices.end()); - } } if (visible_devices.empty()) { Kokkos::abort( @@ -379,9 +355,8 @@ std::vector Kokkos::Impl::get_visible_devices( } int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { - std::vector visible_devices = - get_visible_devices(settings, get_device_count()); - int const num_devices = visible_devices.size(); + std::vector visible_devices = get_visible_devices(get_device_count()); + int const num_devices = visible_devices.size(); // device_id is provided if (settings.has_device_id()) { int const id = settings.get_device_id(); @@ -920,36 +895,18 @@ void Kokkos::Impl::parse_command_line_arguments( int num_threads; int device_id; - int num_devices; // deprecated - int skip_device; // deprecated std::string map_device_id_by; bool disable_warnings; bool print_configuration; bool tune_internals; - auto get_flag = [](std::string s) -> std::string { - return s.erase(s.find('=')); - }; - bool help_flag = false; int iarg = 0; while (iarg < argc) { bool remove_flag = false; - if (check_arg(argv[iarg], "--kokkos-numa") || - check_arg(argv[iarg], "--numa")) { - 
warn_deprecated_command_line_argument(get_flag(argv[iarg])); - // remove flag if prefixed with '--kokkos-' - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) || - check_arg_int(argv[iarg], "--num-threads", num_threads) || - check_arg_int(argv[iarg], "--kokkos-threads", num_threads) || - check_arg_int(argv[iarg], "--threads", num_threads)) { - if (get_flag(argv[iarg]) != "--kokkos-num-threads") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-num-threads"); - } + if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads)) { if (!is_valid_num_threads(num_threads)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -958,15 +915,8 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_num_threads(num_threads); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) || - check_arg_int(argv[iarg], "--device-id", device_id) || - check_arg_int(argv[iarg], "--kokkos-device", device_id) || - check_arg_int(argv[iarg], "--device", device_id)) { - if (get_flag(argv[iarg]) != "--kokkos-device-id") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-device-id"); - } + remove_flag = true; + } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id)) { if (!is_valid_device_id(device_id)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." 
@@ -975,70 +925,7 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_device_id(device_id); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices") || - check_arg(argv[iarg], "--ndevices")) { - if (check_arg(argv[iarg], "--num-devices")) { - warn_deprecated_command_line_argument("--num-devices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--ndevices")) { - warn_deprecated_command_line_argument("--ndevices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--kokkos-ndevices")) { - warn_deprecated_command_line_argument("--kokkos-ndevices", - "--kokkos-num-devices"); - } - warn_deprecated_command_line_argument( - "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank"); - // Find the number of device (expecting --device=XX) - if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) || - (strncmp(argv[iarg], "--num-devices=", 14) == 0) || - (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) || - (strncmp(argv[iarg], "--ndevices=", 11) == 0))) - throw_runtime_exception( - "Error: expecting an '=INT[,INT]' after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - - char* num1 = strchr(argv[iarg], '=') + 1; - char* num2 = strpbrk(num1, ","); - int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; - char* num1_only = new char[num1_len + 1]; - strncpy(num1_only, num1, num1_len); - num1_only[num1_len] = '\0'; - - if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices'." 
- " Raised by Kokkos::initialize()."); - } - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - num_devices = std::stoi(num1_only); - settings.set_num_devices(num_devices); - settings.set_map_device_id_by("mpi_rank"); - } - delete[] num1_only; - - if (num2 != nullptr) { - if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices=XX,'." - " Raised by Kokkos::initialize()."); - - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - skip_device = std::stoi(num2 + 1); - settings.set_skip_device(skip_device); - } - } - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; + remove_flag = true; } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings", disable_warnings)) { settings.set_disable_warnings(disable_warnings); @@ -1107,9 +994,6 @@ void Kokkos::Impl::parse_environment_variables( } combine(settings, tools_init_arguments); - if (std::getenv("KOKKOS_NUMA")) { - warn_deprecated_environment_variable("KOKKOS_NUMA"); - } int num_threads; if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) { if (!is_valid_num_threads(num_threads)) { @@ -1134,34 +1018,6 @@ void Kokkos::Impl::parse_environment_variables( } settings.set_device_id(device_id); } - int num_devices; - int rand_devices; - bool has_num_devices = check_env_int("KOKKOS_NUM_DEVICES", num_devices); - bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices); - if (has_rand_devices && has_num_devices) { - Impl::throw_runtime_exception( - "Error: cannot specify both KOKKOS_NUM_DEVICES and " - "KOKKOS_RAND_DEVICES." 
- " Raised by Kokkos::initialize()."); - } - if (has_num_devices) { - warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank"); - settings.set_map_device_id_by("mpi_rank"); - settings.set_num_devices(num_devices); - } - if (has_rand_devices) { - warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=random"); - settings.set_map_device_id_by("random"); - settings.set_num_devices(rand_devices); - } - if (has_num_devices || has_rand_devices) { - int skip_device; - if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) { - settings.set_skip_device(skip_device); - } - } bool disable_warnings; if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) { settings.set_disable_warnings(disable_warnings); diff --git a/core/src/impl/Kokkos_DeviceManagement.hpp b/core/src/impl/Kokkos_DeviceManagement.hpp index bd89c8b19ca..a8ec208c948 100644 --- a/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/core/src/impl/Kokkos_DeviceManagement.hpp @@ -26,8 +26,7 @@ int get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); // ditto -std::vector get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count); +std::vector get_visible_devices(int device_count); } // namespace Impl } // namespace Kokkos diff --git a/core/src/impl/Kokkos_InitializationSettings.hpp b/core/src/impl/Kokkos_InitializationSettings.hpp index d5732f284bc..11a93c6bb56 100644 --- a/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/core/src/impl/Kokkos_InitializationSettings.hpp @@ -38,12 +38,32 @@ class InitializationSettings { TYPE get_##NAME() const noexcept { return *m_##NAME; } \ static_assert(true, "no-op to require trailing semicolon") +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + private: \ + std::optional m_##NAME; \ + \ + public: \ + KOKKOS_DEPRECATED 
InitializationSettings& set_##NAME(TYPE NAME) { \ + m_##NAME = NAME; \ + return *this; \ + } \ + KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ + return static_cast(m_##NAME); \ + } \ + KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { return *m_##NAME; } \ + static_assert(true, "no-op to require trailing semicolon") +#else +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + static_assert(true, "no-op to require trailing semicolon") +#endif + public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE(int, num_devices); // deprecated - KOKKOS_IMPL_DECLARE(int, skip_device); // deprecated + KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); + KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); diff --git a/core/unit_test/TestInitializationSettings.cpp b/core/unit_test/TestInitializationSettings.cpp index 08eddc79e5d..40dc3f11df3 100644 --- a/core/unit_test/TestInitializationSettings.cpp +++ b/core/unit_test/TestInitializationSettings.cpp @@ -28,8 +28,6 @@ TEST(defaultdevicetype, initialization_settings) { EXPECT_TRUE(settings.has_num_threads()); EXPECT_EQ(settings.get_num_threads(), 255); EXPECT_FALSE(settings.has_device_id()); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); EXPECT_TRUE(settings.has_disable_warnings()); EXPECT_FALSE(settings.get_disable_warnings()); EXPECT_FALSE(settings.has_tune_internals()); @@ -51,8 +49,6 @@ constexpr bool test_initialization_settings_getter() { TYPE>::value); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int); 
CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool); diff --git a/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp index 176ce9b5fed..a56dfd9efc7 100644 --- a/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp +++ b/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp @@ -166,22 +166,6 @@ TEST(defaultdevicetype, cmd_line_args_device_id) { EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"}); } -TEST(defaultdevicetype, cmd_line_args_num_devices) { - CmdLineArgsHelper cla = {{ - "--kokkos-num-devices=5,6", - "--kokkos-num-devices=7", - "-v", - }}; - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 7); - // this is the current behavior, not suggesting this cannot be revisited - EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment"; - EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment"; - EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"}); -} - TEST(defaultdevicetype, cmd_line_args_disable_warning) { CmdLineArgsHelper cla = {{ "--kokkos-disable-warnings=1", @@ -351,20 +335,6 @@ TEST(defaultdevicetype, env_vars_device_id) { EXPECT_EQ(settings.get_device_id(), 33); } -TEST(defaultdevicetype, env_vars_num_devices) { - EnvVarsHelper ev = {{ - {"KOKKOS_NUM_DEVICES", "4"}, - {"KOKKOS_SKIP_DEVICE", "1"}, - }}; - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_environment_variables(settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 4); - EXPECT_TRUE(settings.has_skip_device()); - EXPECT_EQ(settings.get_skip_device(), 1); -} - TEST(defaultdevicetype, 
env_vars_disable_warnings) { for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) { EnvVarsHelper ev = {{ @@ -420,22 +390,20 @@ TEST(defaultdevicetype, env_vars_tune_internals) { } TEST(defaultdevicetype, visible_devices) { -#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ - do { \ - EnvVarsHelper ev{ENV}; \ - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ - Kokkos::InitializationSettings settings; \ - Kokkos::Impl::parse_environment_variables(settings); \ - auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \ - std::vector expected = DEV; \ - EXPECT_EQ(expected.size(), computed.size()) \ - << ev << "device count: " << CNT; \ - auto n = std::min(expected.size(), computed.size()); \ - for (int i = 0; i < n; ++i) { \ - EXPECT_EQ(expected[i], computed[i]) \ - << "devices differ at index " << i << '\n' \ - << ev << "device count: " << CNT; \ - } \ +#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ + do { \ + EnvVarsHelper ev{ENV}; \ + SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ + auto computed = Kokkos::Impl::get_visible_devices(CNT); \ + std::vector expected = DEV; \ + EXPECT_EQ(expected.size(), computed.size()) \ + << ev << "device count: " << CNT; \ + auto n = std::min(expected.size(), computed.size()); \ + for (int i = 0; i < n; ++i) { \ + EXPECT_EQ(expected[i], computed[i]) \ + << "devices differ at index " << i << '\n' \ + << ev << "device count: " << CNT; \ + } \ } while (false) #define DEV(...) \ @@ -444,6 +412,8 @@ TEST(defaultdevicetype, visible_devices) { // first test with all environment variables that are involved in determining // the visible devices so user set var do not mess up the logic below. + // KOKKOS_NUM_DEVICES and KOKKOS_SKIP_DEVICE are deprecated since 3.7 and are + // not taken into account anymore. 
KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, {"KOKKOS_SKIP_DEVICE", "1"}), @@ -452,10 +422,10 @@ TEST(defaultdevicetype, visible_devices) { ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6, DEV(2, 1)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6, - DEV(0, 1, 2)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6, - DEV(0, 2, 3)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6, DEV(1, 3, 4)); KOKKOS_TEST_VISIBLE_DEVICES( From 650ac40677765a7a8e6951528489a5ff75e9da58 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Jan 2024 10:40:45 -0500 Subject: [PATCH 242/432] Avoid unnecessary zero-memset of the scratch flags in SYCL (#6739) * SYCL: Homogenize scratch_flags with CUDA and HIP * Add comments for CUDA and HIP * Fix typo [ci skip] --------- Co-authored-by: Daniel Arndt --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 3 +++ core/src/HIP/Kokkos_HIP_Instance.cpp | 3 +++ core/src/SYCL/Kokkos_SYCL_Instance.cpp | 12 ++++++++---- core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp | 2 ++ core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 2 ++ core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 2 ++ core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 1 + 7 files changed, 21 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 1d61c9c5906..870284b3723 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -345,6 +345,9 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { m_scratchFlags = static_cast( mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. 
+ // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 3b5a1e0017c..d9fb99f1751 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -226,6 +226,9 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { m_scratchFlags = static_cast( mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 05b50d52534..f05deab54b0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -288,12 +288,16 @@ sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { m_scratchFlagsCount, sizeScratchGrain); m_scratchFlags = static_cast(mem_space.allocate( "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
+ auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 953d2235b31..f55280e22e3 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -234,6 +234,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -279,6 +280,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_value); else { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 7f29dcf9d9c..5333e3c8a83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -177,6 +177,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -219,6 +220,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 01819b848af..27165c59e3a 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -229,6 +229,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -281,6 +282,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index d6f3219defd..977b69bc9eb 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -187,6 +187,7 @@ class ParallelScanSYCLBase { } item.barrier(sycl::access::fence_space::global_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; value_type total; reducer.init(&total); From 5403681145d2fa2ee99dc10a5f3d62e4dc036512 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Thu, 25 Jan 2024 15:47:37 +0000 Subject: [PATCH 243/432] std(remove-if): fixing tmp view alloc + avoid evaluating twice the predicate during final pass --- .../std_algorithms/impl/Kokkos_RemoveAllVariants.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp index 50224c8874e..456df43aed2 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp @@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor { void operator()(const IndexType i, IndexType& update, const bool final_pass) const { auto& myval = m_first_from[i]; - if (final_pass) { - if (!m_must_remove(myval)) { + + if (!m_must_remove(myval)) { + if (final_pass) { // calling move here is ok because we are inside final pass // we are calling move assign as specified by the std m_first_dest[update] = std::move(myval); } - } - if (!m_must_remove(myval)) { update += 1; } } @@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label, // create helper tmp view using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count); + tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex, + "std_remove_if_tmp_view"), + keep_count); using tmp_readwrite_iterator_type = decltype(begin(tmp_view)); // in stage 1, *move* all elements to keep 
from original range to tmp From d2913cb3895c05c20055a86756f11e2c6b2b3bed Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 29 Jan 2024 15:44:54 -0500 Subject: [PATCH 244/432] Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` (#6713) * Make Kokkos::device_id consistent with KOKKOS_VISIBLE_DEVICES * Mask visible devices in CUDA/HIP::print_config * fixup! Make Kokkos::device_id consistent with KOKKOS_VISIBLE_DEVICES * Add Kokkos::num_devices() -> int * Let num_devices() return -1 when no device backend is enabled * Update device and threads unit test * Skip num_devices and device_id tests if KOKKOS_VISIBLE_DEVICES env var is defined * Fix device_id test with SYCL * Fix HIP test GetDevice[Count] * Enable device initialization testing for OpenMPTarget As far as I understand it was resolved in #5492 * Improve error message when the device id cannot be determined --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 5 +-- core/src/HIP/Kokkos_HIP_Instance.cpp | 6 ++-- core/src/Kokkos_Core.hpp | 1 + core/src/impl/Kokkos_Core.cpp | 35 ++++++++++++++---- core/src/impl/Kokkos_DeviceManagement.hpp | 4 +-- core/unit_test/CMakeLists.txt | 10 +++--- core/unit_test/TestDeviceAndThreads.py | 21 ++++++++--- core/unit_test/UnitTest_DeviceAndThreads.cpp | 37 +++++++++++--------- 8 files changed, 76 insertions(+), 43 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 870284b3723..43d4a70eab8 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -221,10 +221,7 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." 
<< (CUDA_VERSION % 1000) / 10 << '\n'; #endif - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - - for (int i = 0; i < count; ++i) { + for (int i : get_visible_devices()) { cudaDeviceProp prop; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index d9fb99f1751..74bab397429 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include /*--------------------------------------------------------------------------*/ @@ -89,10 +90,7 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - - for (int i = 0; i < hipDevCount; ++i) { + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); std::string gpu_type = hipProp.integrated == 1 ? 
"APU" : "dGPU"; diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index b4863620cde..f75f9069c31 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -102,6 +102,7 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_finalized() noexcept; [[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_devices() noexcept; [[nodiscard]] int num_threads() noexcept; bool show_warnings() noexcept; diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 25217786471..0229da88923 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -167,20 +167,43 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +std::vector const& Kokkos::Impl::get_visible_devices() { + static auto devices = get_visible_devices(get_device_count()); + return devices; +} + [[nodiscard]] int Kokkos::device_id() noexcept { #if defined(KOKKOS_ENABLE_CUDA) - return Cuda().cuda_device(); + int device = Cuda().cuda_device(); #elif defined(KOKKOS_ENABLE_HIP) - return HIP().hip_device(); + int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - return Experimental::OpenACC().acc_device_number(); + int device = Experimental::OpenACC().acc_device_number(); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_default_device(); // FIXME_OPENMPTARGET + int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - return Experimental::Impl::SYCLInternal::m_syclDev; + int device = Experimental::Impl::SYCLInternal::m_syclDev; #else - return -1; + int device = -1; + return device; #endif + auto const& visible_devices = Impl::get_visible_devices(); + for (std::size_t i = 0; i < visible_devices.size(); ++i) { + if (visible_devices[i] == device) { + return i; + } + } + Kokkos::abort("Unexpected error: cannot determine device id"); + return -1; +} + +[[nodiscard]] int Kokkos::num_devices() noexcept { + if constexpr 
(std::is_same_v) { + return -1; // no GPU backend enabled + } else { + return Impl::get_visible_devices().size(); + } } [[nodiscard]] int Kokkos::num_threads() noexcept { diff --git a/core/src/impl/Kokkos_DeviceManagement.hpp b/core/src/impl/Kokkos_DeviceManagement.hpp index a8ec208c948..5783cb3d79e 100644 --- a/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/core/src/impl/Kokkos_DeviceManagement.hpp @@ -25,8 +25,8 @@ namespace Impl { int get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); -// ditto -std::vector get_visible_devices(int device_count); +std::vector get_visible_devices(int device_count); // test-only +std::vector const& get_visible_devices(); // use this instead } // namespace Impl } // namespace Kokkos diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 45e002dd63f..dc982928b09 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -1235,12 +1235,10 @@ if (NOT KOKKOS_HAS_TRILINOS) INPUT TestDeviceAndThreads.py ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} ) - if(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET does not select the right device - add_test( - NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py - ) - endif() + add_test( + NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py + ) endif() endif() diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 95727dad85c..63d26ad41a4 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -18,6 +18,7 @@ import unittest import subprocess import platform +import os PREFIX = "$" EXECUTABLE = "$" @@ -64,13 +65,25 @@ def test_num_threads(self): "num_threads", "--kokkos-num-threads={}".format(num_threads))) + def test_num_devices(self): + if 
"KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + self.assertNotEqual(num_devices, 0) + if num_devices == -1: + self.skipTest("no device backend enabled") + self.assertGreaterEqual(num_devices, 1) + def test_device_id(self): - device_count = GetFlag("device_count") - if device_count == 0: - self.skipTest("no device detected") + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + if num_devices == -1: + self.assertEqual(-1, GetFlag("device_id")) + self.skipTest("no device backend enabled") # by default use the first GPU available for execution self.assertEqual(0, GetFlag("device_id")) - for device_id in range(device_count): + for device_id in range(num_devices): self.assertEqual( device_id, GetFlag( diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index ea944bae4cd..210df501201 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -19,22 +19,23 @@ #include #include -int get_device_count() { +int get_num_devices() { + int num_devices; #if defined(KOKKOS_ENABLE_CUDA) - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_HIP) - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&count)); - return count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); + num_devices = omp_get_num_devices(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_num_devices(acc_get_device_type()); + num_devices = acc_get_num_devices(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + num_devices = sycl::device::get_devices(sycl::info::device_type::gpu).size(); 
#else - return 0; + num_devices = -1; #endif + assert(num_devices == Kokkos::num_devices()); + return num_devices; } int get_device_id() { @@ -44,15 +45,17 @@ int get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_device_num(); #elif defined(KOKKOS_ENABLE_OPENACC) - device_id = acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) - // FIXME_SYCL ? - assert(false); - return -2; + // Not able to query the underlying runtime because there is no such thing as + // device currently being used with SYCL. We go through the Kokkos runtime + // which makes the assert below pointless but it still let us check that + // Kokkos selected the device we asked for from the Python tests. + device_id = Kokkos::device_id(); #else - device_id = -1; + device_id = -1; #endif assert(device_id == Kokkos::device_id()); return device_id; @@ -98,7 +101,7 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(num_threads); KOKKOS_TEST_PRINT_FLAG(max_threads); KOKKOS_TEST_PRINT_FLAG(device_id); - KOKKOS_TEST_PRINT_FLAG(device_count); + KOKKOS_TEST_PRINT_FLAG(num_devices); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); From b3d8643e83c4b0f4ebd9f1ccc6bb380b0b8f6090 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Jan 2024 16:12:15 -0500 Subject: [PATCH 245/432] Drop CudaInternal::cuda_get_last_error_wrapper() Co-Authored-By: Daniel Arndt --- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 6 ------ core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 3 +-- core/src/Cuda/Kokkos_Cuda_Task.hpp | 13 +++++-------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 9c452573a51..db764fb1b9a 100644 
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -264,12 +264,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - cudaError_t cuda_get_last_error_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetLastError(); - } - template cudaError_t cuda_graph_add_dependencies_wrapper( cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b68eec13a01..0dbe40d8942 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -675,8 +675,7 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_instance->cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); cuda_instance->fence( "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif diff --git a/core/src/Cuda/Kokkos_Cuda_Task.hpp b/core/src/Cuda/Kokkos_Cuda_Task.hpp index 19179ce5c0c..f25170fced7 100644 --- a/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -264,8 +264,7 @@ class TaskQueueSpecialization> { cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization> { set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); @@ -505,8 +504,7 @@ class TaskQueueSpecializationConstrained< 
cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); From a082f820d557cdd4921cd893ba0c54d0fe7a211b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 30 Jan 2024 17:18:19 -0500 Subject: [PATCH 246/432] Avoid calling wrapper functions with singleton in some classes --- core/src/Cuda/Kokkos_CudaSpace.cpp | 27 +++++++++++++------------- core/src/Cuda/Kokkos_Cuda_Error.hpp | 4 ---- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 12 ------------ 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index 9cd074df4fd..e3dbd4a6e13 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -83,11 +83,11 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL( (CudaInternal::singleton().cuda_memcpy_async_wrapper( dst, src, n, cudaMemcpyDefault, s))); - Impl::cuda_stream_synchronize( - s, + Kokkos::Tools::Experimental::Impl::profile_fence_event( + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync", Kokkos::Tools::Experimental::SpecialSynchronizationCases:: DeepCopyResourceSynchronization, - "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); + [&]() { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); }); } } // namespace Impl @@ -349,18 +349,17 @@ void CudaSpace::impl_deallocate( if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence before 
async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_async_wrapper( - arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence after async free"); } else { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } #else - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #endif } catch (...) { } @@ -392,8 +391,8 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } @@ -423,8 +422,8 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - KOKKOS_IMPL_CUDA_SAFE_CALL(( - Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) 
{ } } diff --git a/core/src/Cuda/Kokkos_Cuda_Error.hpp b/core/src/Cuda/Kokkos_Cuda_Error.hpp index f68e05f7804..c4458c910ca 100644 --- a/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -27,10 +27,6 @@ namespace Kokkos { namespace Impl { -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string& name); void cuda_device_synchronize(const std::string& name); void cuda_stream_synchronize(const cudaStream_t stream, const std::string& name); diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 6c60532705a..9200d0263b5 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -168,18 +168,6 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, }); } -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, reason, [&]() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper( - stream))); - }); -} - void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; From eecd917f660702d435db456c7180aef411bf7339 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Tue, 30 Jan 2024 20:06:07 -0500 Subject: [PATCH 247/432] Change the default execution policy behavior of the OpenACC backend from synchronous to asynchronous executions. - Change the default OpenACC async_arg value from acc_async_sync to acc_async_noval. - Add acc_wait(async_arg) to scalar reduction operations (parallel_reduce()). 
--- core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp | 26 ++++++++++++------- core/src/OpenACC/Kokkos_OpenACC_Instance.hpp | 4 +-- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 10 +++++++ .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 2 ++ .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 1 + 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp b/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp index 4aed7e00f76..ca022192b0b 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -52,7 +52,7 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -60,7 +60,7 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } }; @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_to_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, const void* src, size_t n) { @@ -85,7 +87,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ 
-93,7 +96,8 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; @@ -104,7 +108,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -120,14 +125,17 @@ template struct Kokkos::Impl::DeepCopy< Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_from_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index 6645616ba51..c3d72368727 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -35,7 +35,7 @@ class OpenACCInternal { public: static int m_acc_device_num; - int m_async_arg = acc_async_sync; + int m_async_arg = acc_async_noval; OpenACCInternal() = default; @@ -43,7 +43,7 @@ class OpenACCInternal { bool verify_is_initialized(const char* const label) const; - void initialize(int async_arg = acc_async_sync); + void initialize(int async_arg = acc_async_noval); void finalize(); bool is_initialized() const; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index b02ad8dfd95..5afb5e75d39 
100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -136,6 +136,7 @@ class Kokkos::Impl::ParallelReduce Date: Tue, 30 Jan 2024 17:25:42 -0500 Subject: [PATCH 248/432] Avoid calling wrapper functions with singleton in Kokkos_Cuda_Task.cpp --- core/src/Cuda/Kokkos_Cuda_Task.hpp | 40 ++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Task.hpp b/core/src/Cuda/Kokkos_Cuda_Task.hpp index f25170fced7..86d6d91bbee 100644 --- a/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -225,8 +225,10 @@ class TaskQueueSpecialization> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; + const Kokkos::Cuda& exec = scheduler.get_execution_space(); + const auto& impl_instance = exec.impl_internal_space_instance(); const int multi_processor_count = - scheduler.get_execution_space().cuda_device_prop().multiProcessorCount; + exec.cuda_device_prop().multiProcessorCount; const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; @@ -247,18 +249,16 @@ class TaskQueueSpecialization> { // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( @@ -271,9 +271,8 @@ class TaskQueueSpecialization> { "Cuda>::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -467,10 +466,12 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; + const Kokkos::Cuda exec = Cuda(); // FIXME_CUDA_MULTIPLE_DEVICES + const auto& impl_instance = exec.impl_internal_space_instance(); const int multi_processor_count = // FIXME not sure why this didn't work - // scheduler.get_execution_space().cuda_device_prop().multiProcessorCount; - CudaInternal::singleton().m_deviceProp.multiProcessorCount; + // exec.cuda_device_prop().multiProcessorCount; + impl_instance->m_deviceProp.multiProcessorCount; const dim3 grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 ); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); @@ -487,18 +488,16 @@ class TaskQueueSpecializationConstrained< // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( @@ -511,9 +510,8 @@ class TaskQueueSpecializationConstrained< "Kokkos::Cuda>::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } From 71b246d674600c7f83bc553f3ce5fe7dd6216d2b Mon Sep 17 00:00:00 2001 From: ajpowelsnl <49000089+ajpowelsnl@users.noreply.github.com> Date: Wed, 31 Jan 2024 05:18:14 -0700 Subject: [PATCH 249/432] Deprecate `in_parallel` (#6032) (#6582) * in_parallel: rebase on upstream/develop, resolve conflicts, clang-formatting * rebase on upstream/develop, resolve conflicts, clang format * Apply clang-formatting (clang-8.0.1), mv execute_in_serial function * OpenMP fixes, clang-8 formatting * Kokkos_OpenMP_Instance.hpp: add ";" to end of class definition * Address round 3 of DA comments * Fix typos in deprecation markings * Fix CI failures in OpenACC, OpenMP, Threads backends * rm conditional statement * rm additional `in_parallel` statements * Correctly flag namespace brace * Address most recent DA comments * rm brace at 3108 * rm else branch; change "==" to "=" in TestExecutionSpace * Add deprecation marking for all `in_parallel`-requiring tests * TestHPX_InParallel.cpp: rm unused variable "n" * TestHPX_InParallel.cpp: put "n" inside of Deprecation marking * Add dep.
marking to `impl_in_parallel_scope`-type statements * Add deprecation marking to all impl_not_in_parallel_scope * Address DLG comments * rm errant "e += i" * rm second definition of "execute_in_serial" * Address DA comments * Fix up function call operator * Addressing final DA comments * Apply deprecation markings uniformly * Put deprecation markings around braced clauses * Apply suggestions from review --------- Co-authored-by: Damien L-G --- core/src/Cuda/Kokkos_Cuda.hpp | 5 +- core/src/HIP/Kokkos_HIP.hpp | 4 +- core/src/HPX/Kokkos_HPX.cpp | 2 + core/src/HPX/Kokkos_HPX.hpp | 28 +++++- core/src/OpenACC/Kokkos_OpenACC.hpp | 6 +- core/src/OpenMP/Kokkos_OpenMP.cpp | 6 +- core/src/OpenMP/Kokkos_OpenMP.hpp | 4 +- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 32 +++---- core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 6 +- core/src/SYCL/Kokkos_SYCL.hpp | 4 +- core/src/Serial/Kokkos_Serial.hpp | 5 +- core/src/Threads/Kokkos_Threads.hpp | 4 +- core/src/Threads/Kokkos_Threads_Instance.cpp | 4 +- core/src/Threads/Kokkos_Threads_Instance.hpp | 8 +- core/src/impl/Kokkos_ViewMapping.hpp | 96 +++++++++---------- core/unit_test/CMakeLists.txt | 2 + core/unit_test/TestExecutionSpace.hpp | 8 +- .../incremental/Test01_execspace.hpp | 2 + 18 files changed, 136 insertions(+), 90 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 90e982474df..276d03da265 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -128,13 +128,16 @@ class Cuda { /// \brief True if and only if this method is being called in a /// thread-parallel function. - KOKKOS_INLINE_FUNCTION static int in_parallel() { + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__CUDA_ARCH__) return true; #else return false; #endif } +#endif /// \brief Wait until all dispatched functors complete. 
/// diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp index 11beb48852c..dd210b3874c 100644 --- a/core/src/HIP/Kokkos_HIP.hpp +++ b/core/src/HIP/Kokkos_HIP.hpp @@ -57,13 +57,15 @@ class HIP { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. * diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 4a40ffcaa4f..6d541a64148 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -103,6 +103,7 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 bool &HPX::impl_get_in_parallel() noexcept { static thread_local bool in_parallel = false; return in_parallel; @@ -127,6 +128,7 @@ HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { KOKKOS_EXPECTS(!impl_get_in_parallel()); impl_get_in_parallel() = true; } +#endif void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock l(m_active_parallel_region_count_mutex); diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 9438a817408..e444f746e7a 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -201,6 +201,7 @@ class HPX { return impl_get_instance_data().m_instance_id; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static bool &impl_get_in_parallel() noexcept; struct impl_in_parallel_scope { @@ -223,9 +224,10 @@ class HPX { delete; }; - static bool in_parallel(HPX const & = HPX()) noexcept { + KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { return impl_get_in_parallel(); } +#endif static void impl_decrement_active_parallel_region_count(); static void 
impl_increment_active_parallel_region_count(); @@ -343,7 +345,9 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, n, stacksize); @@ -405,15 +409,21 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.setup(); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.finalize(); }}, n, stacksize); @@ -1280,6 +1290,7 @@ class ParallelScan, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1287,6 +1298,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1308,6 +1322,7 @@ class ParallelScan, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1315,6 +1330,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1395,6 +1413,7 @@ class ParallelScanWithTotal, const WorkRange 
range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1402,6 +1421,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1423,6 +1445,7 @@ class ParallelScanWithTotal, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1430,6 +1453,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( diff --git a/core/src/OpenACC/Kokkos_OpenACC.hpp b/core/src/OpenACC/Kokkos_OpenACC.hpp index b012f6a42a4..5155bee33dc 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -91,7 +91,11 @@ class OpenACC { #else int concurrency() const { return 256000; } // FIXME_OPENACC #endif - static bool in_parallel() { return acc_on_device(acc_device_not_host); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool in_parallel() { + return acc_on_device(acc_device_not_host); + } +#endif uint32_t impl_instance_id() const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 245e1bfb3af..81f2c5c3056 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -81,12 +81,14 @@ bool OpenMP::impl_is_initialized() noexcept { return 
Impl::OpenMPInternal::singleton().is_initialized(); } -bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); } +#endif int OpenMP::impl_thread_pool_size() const noexcept { - return OpenMP::in_parallel(*this) + return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; } diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 9ee2291c029..404076d0111 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -82,8 +82,10 @@ class OpenMP { /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief is the instance running a parallel algorithm - static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; +#endif /// \brief Wait until all dispatched functors complete on the given instance /// diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index f4b753593d4..35b9aa93ba7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -41,23 +41,6 @@ #include /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { - -inline bool execute_in_serial(OpenMP const& space = OpenMP()) { -// The default value returned by `omp_get_max_active_levels` with gcc version -// lower than 11.1.0 is 2147483647 instead of 1. 
-#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ - _OPENMP >= 201511 - bool is_nested = omp_get_max_active_levels() > 1; -#else - bool is_nested = static_cast(omp_get_nested()); -#endif - return (OpenMP::in_parallel(space) && !(is_nested && (omp_get_level() == 1))); -} - -} // namespace Impl -} // namespace Kokkos namespace Kokkos { namespace Impl { @@ -117,6 +100,8 @@ class OpenMPInternal { return m_pool[i]; } + int get_level() const { return m_level; } + bool is_initialized() const { return m_initialized; } bool verify_is_initialized(const char* const label) const; @@ -124,6 +109,19 @@ class OpenMPInternal { void print_configuration(std::ostream& s) const; }; +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. +#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; +#else + bool is_nested = static_cast(omp_get_nested()); +#endif + return (space.impl_internal_space_instance()->get_level() < omp_get_level() && + !(is_nested && (omp_get_level() == 1))); +} + } // namespace Impl namespace Experimental { diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index adf972dd081..ea4e7f6baba 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -65,7 +65,11 @@ class OpenMPTarget { using scratch_memory_space = ScratchMemorySpace; - inline static bool in_parallel() { return omp_in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static bool in_parallel() { + return omp_in_parallel(); + } +#endif static void fence(const std::string& name = "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); diff --git a/core/src/SYCL/Kokkos_SYCL.hpp b/core/src/SYCL/Kokkos_SYCL.hpp index 
8de860c87f6..47756b039bd 100644 --- a/core/src/SYCL/Kokkos_SYCL.hpp +++ b/core/src/SYCL/Kokkos_SYCL.hpp @@ -78,13 +78,15 @@ class SYCL { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__SYCL_DEVICE_ONLY__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index 67119cac164..43eb4992ed7 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -121,7 +121,10 @@ class Serial { /// For the Serial device, this method always returns false, /// because parallel_for or parallel_reduce with the Serial device /// always execute sequentially. - inline static int in_parallel() { return false; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static int in_parallel() { return false; } +#endif /// \brief Wait until all dispatched functors complete. /// diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index 36a66230be0..31653c46cac 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -64,7 +64,9 @@ class Threads { /// \brief True if and only if this method is being called in a /// thread-parallel function. - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/core/src/Threads/Kokkos_Threads_Instance.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp index 49408b89916..3842966cd77 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -241,13 +241,15 @@ void ThreadsInternal::verify_is_process(const std::string &name, } } -int ThreadsInternal::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master process. return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } +#endif void ThreadsInternal::fence() { fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } diff --git a/core/src/Threads/Kokkos_Threads_Instance.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp index b79b527940a..a5eb231cb01 100644 --- a/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -402,7 +402,9 @@ class ThreadsInternal { */ static void start(void (*)(ThreadsInternal &, const void *), const void *); - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif static void fence(); static void fence(const std::string &); static void internal_fence(); @@ -544,9 +546,11 @@ class ThreadsInternal { namespace Kokkos { -inline int Threads::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline int Threads::in_parallel() { return Impl::ThreadsInternal::in_parallel(); } +#endif inline int Threads::impl_is_initialized() { return Impl::ThreadsInternal::is_initialized(); diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 
16ca33a87d0..6a8ac52fc3c 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -2947,37 +2947,33 @@ struct ViewValueFunctor { template void parallel_for_implementation() { - if (!space.in_parallel()) { - using PolicyType = - Kokkos::RangePolicy, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v - ? "Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } + using PolicyType = + Kokkos::RangePolicy, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v + ? "Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(Tag{}, i); + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space || std::is_same_v) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + 
Kokkos::Profiling::endParallelFor(kpID); } } @@ -3084,32 +3080,28 @@ struct ViewValueFunctor { } void parallel_for_implementation() { - if (!space.in_parallel()) { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } + PolicyType policy(0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "]", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(i); + const Kokkos::Impl::ParallelFor closure( + *this, PolicyType(0, n)); + closure.execute(); + if (default_exec_space) + space.fence( + "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " + "view"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index dc982928b09..f5985b5ff89 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -709,12 +709,14 @@ if(Kokkos_ENABLE_HPX) hpx/TestHPX_IndependentInstancesRefCounting.cpp hpx/TestHPX_IndependentInstancesSynchronization.cpp ) +if(Kokkos_ENABLE_DEPRECATED_CODE_4) 
KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HPX_InParallel SOURCES UnitTestMainInit.cpp hpx/TestHPX_InParallel.cpp ) + endif() endif() if(Kokkos_ENABLE_OPENMPTARGET) diff --git a/core/unit_test/TestExecutionSpace.hpp b/core/unit_test/TestExecutionSpace.hpp index 6f0f159c174..983a5975afd 100644 --- a/core/unit_test/TestExecutionSpace.hpp +++ b/core/unit_test/TestExecutionSpace.hpp @@ -25,13 +25,7 @@ struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable { Kokkos::DefaultExecutionSpace device; Kokkos::DefaultHostExecutionSpace host; - KOKKOS_FUNCTION void operator()(int, int& e) const { - // not actually doing anything useful, mostly checking that - // ExecutionSpace::in_parallel() is callable - if (static_cast(device.in_parallel()) < 0) { - ++e; - } - } + KOKKOS_FUNCTION void operator()(int i, int& e) const { e += i; } CheckClassWithExecutionSpaceAsDataMemberIsCopyable() { int errors; diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index 25c7138ed3c..d7b2a57b442 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -62,8 +62,10 @@ struct TestIncrExecSpace { auto concurrency = ExecSpace().concurrency(); ASSERT_GT(concurrency, 0); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int in_parallel = ExecSpace::in_parallel(); ASSERT_FALSE(in_parallel); +#endif const char* name = ExecSpace::name(); std::cout << name << std::endl; From bbb895a347f4c511a9fc85193ca554f105e99739 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Wed, 31 Jan 2024 05:23:45 -0700 Subject: [PATCH 250/432] Remove redundant calls in rangepolicy constructors (#6765) * Removed redundant calls from RangePolicy constructors * Fixed to call the right constructor --- core/src/Kokkos_ExecPolicy.hpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index
025a2795fb3..979db33a3e1 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -127,9 +127,7 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief Total range */ inline RangePolicy(const member_type work_begin, const member_type work_end) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - } + : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} /** \brief Total range */ template @@ -150,11 +148,8 @@ class RangePolicy : public Impl::PolicyTraits { template inline RangePolicy(const member_type work_begin, const member_type work_end, Args... args) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - check_bounds_validity(); - set_auto_chunk_size(); - set(args...); - } + : RangePolicy(typename traits::execution_space(), work_begin, work_end, + args...) {} private: inline void set() {} From 408e8be5b943cd21092a7294a0fb9b8ecefb648e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 31 Jan 2024 06:49:22 -0700 Subject: [PATCH 251/432] OpenMPTarget on Intel GPUs update (#6735) * Update linker flags for OpenMPTarget on Intel GPUs * Disable failing tests with OpenMPTarget on Intel GPUs * Fix typo * Move -D__STRICT_ANSI__ from linker to compiler flags * Improve FIXMEs --- .../TestStdAlgorithmsTeamCopyIf.cpp | 4 + .../TestStdAlgorithmsTeamExclusiveScan.cpp | 4 + .../TestStdAlgorithmsTeamInclusiveScan.cpp | 4 + .../TestStdAlgorithmsTeamRemoveCopy.cpp | 4 + .../TestStdAlgorithmsTeamRemoveCopyIf.cpp | 4 + .../TestStdAlgorithmsTeamUniqueCopy.cpp | 4 + cmake/kokkos_arch.cmake | 49 ++++++------ .../TestMathematicalSpecialFunctions.hpp | 30 +++++++ core/unit_test/TestNumericTraits.hpp | 79 +++++++++++-------- 9 files changed, 129 insertions(+), 53 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index b5aa27c7c38..7c3c465dc8d 100644 --- 
a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -166,6 +166,10 @@ void run_all_scenarios() { } TEST(std_algorithms_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 4fa4e624db1..2c8fee02f47 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -254,6 +254,10 @@ void run_all_scenarios() { } TEST(std_algorithms_exclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 642a8494390..b5f4cdd6123 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -279,6 +279,10 @@ void run_all_scenarios() { } TEST(std_algorithms_inclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 24b840154b7..6bb0d249988 100644 --- 
a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -212,6 +212,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index ce18eb4d319..cff9aa178a2 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -168,6 +168,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 87687b60a16..0d3289e196f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -186,6 +186,10 @@ void run_all_scenarios() { } TEST(std_algorithms_unique_copy_team_test, test) { + // FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 7c834f2b0dd..575c9b70b1c 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -776,30 +776,35 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) 
COMPILER_SPECIFIC_FLAGS( IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" -D__STRICT_ANSI__ + ELSE() + COMPILER_SPECIFIC_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" -D__STRICT_ANSI__ + IF(KOKKOS_ARCH_INTEL_GEN9) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN11) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" + ) + ELSEIF(KOKKOS_ARCH_INTEL_DG1) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" + ) + ELSEIF(KOKKOS_ARCH_INTEL_XEHP) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" + ) + ELSEIF(KOKKOS_ARCH_INTEL_PVC) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM 
-fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" ) + ENDIF() ENDIF() ENDIF() diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index b90055fd71e..7969dc86864 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1967,31 +1967,61 @@ TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) { #endif TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ0Y0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ1Y1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI0K0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI1K1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel 
GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH2Function test; test.testit(); } diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index f197a2d8891..81a9d0a5e0d 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -210,9 +210,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -224,9 +225,9 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -239,9 +240,9 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + 
(!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -253,9 +254,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -263,9 +264,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -302,8 +303,10 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -326,8 +329,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + 
(!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -349,8 +354,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -358,8 +365,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -380,8 +389,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -395,8 +406,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + 
(!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -407,8 +420,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -426,8 +441,10 @@ TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif From 7d5fff958313c8d7bceec4cef8736b8c8ffd189f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 31 Jan 2024 09:05:48 -0500 Subject: [PATCH 252/432] Get rid of print statements in parallel algorithms unit tests Spotted ``` 27: [----------] 3 tests from std_algorithms_reducers 27: [ RUN ] std_algorithms_reducers.max_first_loc 27: checking reduction with order: LeftToRight 27: checking reduction with order: RightToLeft 27: checking reduction with order: Random 27: [ OK ] std_algorithms_reducers.max_first_loc (0 ms) 27: [ RUN ] std_algorithms_reducers.min_first_loc 27: checking reduction with order: LeftToRight 27: checking reduction with order: RightToLeft 27: checking reduction with order: Random 27: [ OK ] std_algorithms_reducers.min_first_loc (0 ms) 27: [ RUN ] std_algorithms_reducers.min_max_first_last_loc 27: 
checking reduction with order: LeftToRight 27: checking reduction with order: RightToLeft 27: checking reduction with order: Random 27: [ OK ] std_algorithms_reducers.min_max_first_last_loc (0 ms) 27: [----------] 3 tests from std_algorithms_reducers (0 ms total) ``` and ``` 31: [----------] 2 tests from std_algorithms_sorting_ops_test 31: [ RUN ] std_algorithms_sorting_ops_test.is_sorted 31: is_sorted: dynamic_view, all overloads 31: is_sorted: stride2_view, all overloads 31: is_sorted: stride3_view, all overloads 31: [ OK ] std_algorithms_sorting_ops_test.is_sorted (9 ms) 31: [ RUN ] std_algorithms_sorting_ops_test.is_sorted_until 31: is_sorted_until: dynamic_view, all overloads 31: is_sorted_until: stride2_view, all overloads 31: is_sorted_until: stride3_view, all overloads 31: [ OK ] std_algorithms_sorting_ops_test.is_sorted_until (4 ms) 31: [----------] 2 tests from std_algorithms_sorting_ops_test (14 ms total) ``` in the build logs --- .../unit_tests/TestStdAlgorithmsIsSorted.cpp | 7 ++-- .../TestStdAlgorithmsIsSortedUntil.cpp | 19 +++++------ algorithms/unit_tests/TestStdReducers.cpp | 32 +++++++++---------- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index f31d49e06b4..75d4f0afebc 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -146,7 +146,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsA[3] = KE::is_sorted("label", exespace(), view); const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allA); + EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -159,7 +159,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const 
auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allB); + EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); #endif Kokkos::fence(); @@ -173,9 +173,6 @@ void run_is_sorted_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index dcfe8ad67e1..29ac7cc9bc1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); 
Kokkos::fence(); } @@ -176,9 +176,6 @@ void run_is_sorted_until_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted_until: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/algorithms/unit_tests/TestStdReducers.cpp b/algorithms/unit_tests/TestStdReducers.cpp index 3847e1e6a36..c05006a1617 100644 --- a/algorithms/unit_tests/TestStdReducers.cpp +++ b/algorithms/unit_tests/TestStdReducers.cpp @@ -83,9 +83,6 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = std::conditional_t< (flag == 0), Kokkos::MaxFirstLoc, @@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) { const auto pair1 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::LeftToRight); - ASSERT_EQ(pair1.first, gold_value); - ASSERT_EQ(pair1.second, gold_location); + ASSERT_EQ(pair1.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); + ASSERT_EQ(pair1.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); const auto pair2 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::RightToLeft); - ASSERT_EQ(pair2.first, gold_value); - ASSERT_EQ(pair2.second, gold_location); + ASSERT_EQ(pair2.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); + ASSERT_EQ(pair2.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); const auto pair3 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::Random); - ASSERT_EQ(pair3.first, gold_value); - ASSERT_EQ(pair3.second, gold_location); + 
ASSERT_EQ(pair3.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::Random); + ASSERT_EQ(pair3.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::Random); } TEST(std_algorithms_reducers, min_first_loc) { @@ -191,9 +194,6 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = Kokkos::MinMaxFirstLastLoc; @@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, reduction_value_type{view(index), view(index), index, index}); } - ASSERT_EQ(red_result.min_val, gold_values.first); - ASSERT_EQ(red_result.max_val, gold_values.second); - ASSERT_EQ(red_result.min_loc, gold_locs.first); - ASSERT_EQ(red_result.max_loc, gold_locs.second); + ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue); + ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_loc, gold_locs.second) << order_to_string(enValue); } TEST(std_algorithms_reducers, min_max_first_last_loc) { From 729940c8792adaa920f09179db6656706089a00f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 31 Jan 2024 11:55:39 -0500 Subject: [PATCH 253/432] Attempt to fix device id test with OpenMPTarget --- core/unit_test/UnitTest_DeviceAndThreads.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index 210df501201..25442146fba 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -45,7 +45,7 @@ int get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) 
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_default_device(); #elif defined(KOKKOS_ENABLE_OPENACC) device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) From af806fb5de470944631ded4a62492d90bde3a6c8 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 31 Jan 2024 12:57:17 -0500 Subject: [PATCH 254/432] Drop 2-arguments `ZeroMemset` constructor overloads (#6764) * contiguous_fill_or_memset should pass a default-constructed exec space to ZeroMemset * Rely on CTAD with ZeroMemset No one wants to see these template parameters * Drop 2-arguments ZeroMemset constructor overloads * Make sure to include what we use ( for std::memset) * Fix unused variable warning --- core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 9 --------- core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 7 ------- core/src/Kokkos_CopyViews.hpp | 7 ++++--- core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 6 ------ core/src/Serial/Kokkos_Serial_ZeroMemset.hpp | 13 ++++++------- core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 6 ------ core/src/impl/Kokkos_ViewMapping.hpp | 8 ++------ 7 files changed, 12 insertions(+), 44 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index c7f0d12d914..fc257fbe728 100644 --- a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -33,15 +33,6 @@ struct ZeroMemset> { dst.data(), 0, dst.size() * sizeof(typename View::value_type)))); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - // FIXME_CUDA_MULTIPLE_DEVICES - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); - } }; } // namespace Impl diff --git a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 5c40d0fbc8d..ac5ff463cfa 100644 --- 
a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -31,13 +31,6 @@ struct ZeroMemset> { dst.data(), 0, dst.size() * sizeof(typename View::value_type), exec_space.hip_stream())); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } }; } // namespace Impl diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 6bc6485c769..edff1eb8b94 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -1360,7 +1360,7 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - ZeroMemset>(exec_space, dst, value); + ZeroMemset(exec_space, dst, value); else contiguous_fill(exec_space, dst, value); } @@ -1386,15 +1386,16 @@ contiguous_fill_or_memset( typename ViewTraits::const_value_type& value) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; + exec_space_type exec; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset>(dst, value); + ZeroMemset(exec, dst, value); else #endif - contiguous_fill(exec_space_type(), dst, value); + contiguous_fill(exec, dst, value); } template diff --git a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9548f211d9e..9413ae10638 100644 --- a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -35,12 +35,6 @@ struct ZeroMemset> { ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); #endif } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); - } }; } // namespace Impl diff --git a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp 
b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 3ec2dfbcfa0..7d940186167 100644 --- a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,14 +35,12 @@ template struct ZeroMemset< std::conditional_t::value, Serial, DummyExecutionSpace>, - View> - : public ZeroMemset> { - using Base = ZeroMemset>; - using Base::Base; - + View> { ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type& value) - : Base(dst, value) {} + typename View::const_value_type&) { + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + } }; } // namespace Impl diff --git a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index f740c408fb8..af70ba7df79 100644 --- a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -36,12 +36,6 @@ struct ZeroMemset> { using ValueType = typename View::value_type; std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } }; } // end namespace Impl diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 6a8ac52fc3c..90cbca8e03b 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -2917,9 +2917,7 @@ struct ViewValueFunctor { "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( + (void)ZeroMemset( space, Kokkos::View>(ptr, n), @@ -3051,9 +3049,7 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( + 
(void)ZeroMemset( space, Kokkos::View>(ptr, n), From 4c94f089bd636dbcec5cee6e41d7c131ce3b79cc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 31 Jan 2024 21:32:36 -0500 Subject: [PATCH 255/432] Get rid of `ZeroMemset`'s silly trailing value argument (#6769) * ZeroMemset does not want that trailing value argument * Prefer C array of `unsigned char` for the zero-initialized storage --- core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 4 ++-- core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 3 +-- core/src/Kokkos_CopyViews.hpp | 17 ++++++++--------- core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 3 +-- core/src/Serial/Kokkos_Serial_ZeroMemset.hpp | 3 +-- core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 3 +-- core/src/impl/Kokkos_ViewMapping.hpp | 12 ++++-------- 7 files changed, 18 insertions(+), 27 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index fc257fbe728..517c592af72 100644 --- a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -25,8 +25,8 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, + const View& dst) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() ->cuda_memset_async_wrapper( diff --git a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index ac5ff463cfa..4bca29868f7 100644 --- a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -25,8 +25,7 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HIP& exec_space, const View& dst) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( dst.data(), 0, dst.size() * sizeof(typename View::value_type), exec_space.hip_stream())); diff --git 
a/core/src/Kokkos_CopyViews.hpp 
b/core/src/Kokkos_CopyViews.hpp index edff1eb8b94..9b379a092bd 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -1336,13 +1336,12 @@ inline void contiguous_fill( // Default implementation for execution spaces that don't provide a definition template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, - typename ViewType::const_value_type& value) { - contiguous_fill(exec_space, dst, value); - } - - ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { - contiguous_fill(ExecutionSpace(), dst, value); + ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { + using ValueType = typename ViewType::value_type; + alignas(alignof(ValueType)) unsigned char + zero_initialized_storage[sizeof(ValueType)] = {}; + contiguous_fill(exec_space, dst, + *reinterpret_cast(zero_initialized_storage)); } }; @@ -1360,7 +1359,7 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - ZeroMemset(exec_space, dst, value); + ZeroMemset(exec_space, dst); else contiguous_fill(exec_space, dst, value); } @@ -1392,7 +1391,7 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset(exec, dst, value); + ZeroMemset(exec, dst); else #endif contiguous_fill(exec, dst, value); diff --git a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9413ae10638..61db6b34aac 100644 --- a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst, - typename View::const_value_type&) { + const View& dst) { auto event = exec_space.impl_internal_space_instance()->m_queue->memset( dst.data(), 0, dst.size() * sizeof(typename View::value_type)); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES diff --git 
a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 7d940186167..6ad6aabc5a7 100644 --- a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -36,8 +36,7 @@ struct ZeroMemset< std::conditional_t::value, Serial, DummyExecutionSpace>, View> { - ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const Serial&, const View& dst) { using ValueType = typename View::value_type; std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); } diff --git a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index af70ba7df79..3072e2ce825 100644 --- a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HostSpace::execution_space& exec, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HostSpace::execution_space& exec, const View& dst) { // Host spaces, except for HPX, are synchronous and we need to fence for HPX // since we can't properly enqueue a std::memset otherwise. 
// We can't use exec.fence() directly since we don't have a full definition diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 90cbca8e03b..708472be9d3 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -2918,10 +2918,8 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } (void)ZeroMemset( - space, - Kokkos::View>(ptr, n), - value); + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -3050,10 +3048,8 @@ struct ViewValueFunctor { } (void)ZeroMemset( - space, - Kokkos::View>(ptr, n), - value); + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); From 917baa6d64fdab01d0731cba0c294436f8597603 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 1 Feb 2024 12:52:20 -0500 Subject: [PATCH 256/432] Fix typo in deprecation macro used in HIP Co-Authored-By: Damien L-G --- core/src/HIP/Kokkos_HIP.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp index dd210b3874c..3a88e97ee3d 100644 --- a/core/src/HIP/Kokkos_HIP.hpp +++ b/core/src/HIP/Kokkos_HIP.hpp @@ -57,7 +57,7 @@ class HIP { //! \name Functions that all Kokkos devices must implement. 
//@{ -#ifdef KOKKOS_ENABLE_DEPRECATED_4 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; From 7d2ea72128eb8b4142c96fe2709a95d0abc1ab4a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 1 Feb 2024 19:21:55 -0700 Subject: [PATCH 257/432] Cuda multi-GPU support: Make some variables device-specific, update Kokkos::fence (#6753) * Kokkos::fence should fence all devices * Create a couple more variables per device * Don't forget desul::Impl::init_lock_arrays(); * Address reviewer comments * Add {} for std::map initialization Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 51 ++++++++++++---------- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 20 ++++----- core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 11 +++-- 3 files changed, 44 insertions(+), 38 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 9a57eb9398c..4b44d681b21 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -135,7 +135,6 @@ Kokkos::View cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] __host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, @@ -278,6 +277,18 @@ void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -600,27 +611,21 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME - // Allocate a staging buffer for constant mem in pinned host memory and an - // event to avoid overwriting driver for previous kernel launches - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocHost( - reinterpret_cast(&Impl::CudaInternal::constantMemHostStaging), - Impl::CudaTraits::ConstantMemoryUsage)); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaEventCreate(&Impl::CudaInternal::constantMemReusable)); - Impl::CudaInternal::singleton().initialize(singleton_stream); } void Cuda::impl_finalize() { (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaEventDestroy(Impl::CudaInternal::constantMemReusable)); - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaFreeHost(Impl::CudaInternal::constantMemHostStaging)); + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); + } auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); if (deep_copy_space) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index db764fb1b9a..ce0267852a2 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -23,6 +23,10 @@ #include #include #include "Kokkos_CudaSpace.hpp" + +#include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // These functions fulfill the purpose of allowing to work around @@ -116,11 +120,11 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... 
use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + inline static std::set cuda_devices = {}; + inline static std::map constantMemHostStagingPerDevice = + {}; + inline static std::map constantMemReusablePerDevice = {}; + inline static std::map constantMemMutexPerDevice = {}; static CudaInternal& singleton(); @@ -221,12 +225,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 0dbe40d8942..11de1d668e2 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard lock(CudaInternal::constantMemMutex); + std::lock_guard lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, 
sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node( From 63a1208b3897a4c2d0d8ef00fa4d9b883f210647 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 2 Feb 2024 10:11:45 -0500 Subject: [PATCH 258/432] Fixup use provided execution space when copying host inaccessible reduction result (#6777) * Cuda/HIP use provided execution space when copying reduction result to the host * OpenMPTarget: DeepCopy on the provided execution space in ParallelScanWithTotal * Add trailing execution space template parameter that was missing --- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 3 ++- core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 3 ++- .../OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 449de154aed..63b64bf9876 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -872,7 +872,8 @@ class ParallelReduce(m_result_ptr, m_scratch_space, size); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index f2198902e92..609ba28b866 100644 --- a/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -271,7 +271,8 @@ class ParallelReduce(m_result_ptr, m_scratch_space, size); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } else { diff --git 
a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1d6677a1df6..c1f7851f413 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -238,8 +238,10 @@ class ParallelScanWithTotal, if (!base_t::m_result_ptr_device_accessible) { const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + DeepCopy( + base_t::m_policy.space(), base_t::m_result_ptr, + chunk_values.data() + (n_chunks - 1), size); } } else if (!base_t::m_result_ptr_device_accessible) { base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); From b4bc4061426e57f2cae17a932464ceb5203e6fc2 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Fri, 2 Feb 2024 14:42:03 -0500 Subject: [PATCH 259/432] Reenable TestHIP_Memory_Requirements --- core/unit_test/hip/TestHIP_Memory_Requirements.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index a213453ea18..8c72e9f2972 100644 --- a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,9 +48,6 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. 
As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host - // FIXME_HIP - GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " - "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); From 20d52fb1c4cf6adc8a1dc4a589087a29b5c884e2 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 2 Feb 2024 19:07:33 +0000 Subject: [PATCH 260/432] Fix Occupancy for Cuda --- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 6 ++-- core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 2 +- .../TestCommonPolicyConstructors.hpp | 36 +++++++++++++++---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index ce0267852a2..24f4af31019 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -447,10 +447,10 @@ class CudaInternal { } template - cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr, - int value) const { + cudaError_t cuda_func_set_attribute_wrapper(T* entry, cudaFuncAttribute attr, + int value) const { if constexpr (setCudaDevice) set_cuda_device(); - return cudaFuncSetAttributes(entry, attr, value); + return cudaFuncSetAttribute(entry, attr, value); } template diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 11de1d668e2..fbdfc149011 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -222,7 +222,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // FIXME_CUDA_MULTIPLE_DEVICES auto set_cache_config = [&] { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_set_attributes_wrapper( + 
(CudaInternal::singleton().cuda_func_set_attribute_wrapper( func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout))); return carveout; }; diff --git a/core/unit_test/TestCommonPolicyConstructors.hpp b/core/unit_test/TestCommonPolicyConstructors.hpp index ec5d1ae0f46..966fb065395 100644 --- a/core/unit_test/TestCommonPolicyConstructors.hpp +++ b/core/unit_test/TestCommonPolicyConstructors.hpp @@ -45,10 +45,26 @@ static_assert(check_semiregular>()); static_assert(check_semiregular>>()); // Assert that occupancy conversion and hints work properly. -template -void test_prefer_desired_occupancy() { - Policy policy; +template +void test_policy_execution(const Kokkos::RangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template +void test_policy_execution(const Kokkos::TeamPolicy& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy::member_type&){}); +} +template +void test_policy_execution(const Kokkos::MDRangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} +template +void test_policy_execution(const DummyPolicy&) {} +template +void test_prefer_desired_occupancy(Policy policy) { using Kokkos::Experimental::DesiredOccupancy; using Kokkos::Experimental::MaximizeOccupancy; using Kokkos::Experimental::prefer; @@ -60,6 +76,7 @@ void test_prefer_desired_occupancy() { auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); static_assert( !decltype(policy_still_no_occ)::experimental_contains_desired_occupancy); + test_policy_execution(policy_still_no_occ); // MaximizeOccupancy -> DesiredOccupancy auto const policy_with_occ = @@ -67,31 +84,36 @@ void test_prefer_desired_occupancy() { static_assert( decltype(policy_with_occ)::experimental_contains_desired_occupancy); EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 33); + test_policy_execution(policy_with_occ); // DesiredOccupancy -> DesiredOccupancy auto const policy_change_occ = 
prefer(policy_with_occ, DesiredOccupancy{24}); static_assert( decltype(policy_change_occ)::experimental_contains_desired_occupancy); EXPECT_EQ(policy_change_occ.impl_get_desired_occupancy().value(), 24); + test_policy_execution(policy_change_occ); // DesiredOccupancy -> DesiredOccupancy w/ hint auto policy_with_occ_and_hint = Kokkos::Experimental::require( policy_change_occ, Kokkos::Experimental::WorkItemProperty::HintLightWeight); EXPECT_EQ(policy_with_occ_and_hint.impl_get_desired_occupancy().value(), 24); + test_policy_execution(policy_with_occ_and_hint); // DesiredOccupancy -> MaximizeOccupancy auto const policy_drop_occ = prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); static_assert( !decltype(policy_drop_occ)::experimental_contains_desired_occupancy); + test_policy_execution(policy_drop_occ); } TEST(TEST_CATEGORY, execution_policy_occupancy_and_hint) { - test_prefer_desired_occupancy>(); - test_prefer_desired_occupancy>(); - test_prefer_desired_occupancy>(); - test_prefer_desired_occupancy>>(); + test_prefer_desired_occupancy(DummyPolicy<>{}); + test_prefer_desired_occupancy(Kokkos::RangePolicy<>(0, 0)); + test_prefer_desired_occupancy(Kokkos::TeamPolicy<>{0, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); } // Check that the policy size does not increase if the user does not specify the From 97997807de7f592a0845f96ee0adb4644a57efb2 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Mon, 5 Feb 2024 11:13:13 -0700 Subject: [PATCH 261/432] Temporarily disable simd_reduction test for omptarget build --- simd/unit_tests/include/TestSIMD_Reductions.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/simd/unit_tests/include/TestSIMD_Reductions.hpp b/simd/unit_tests/include/TestSIMD_Reductions.hpp index b1aef98c2a8..b3c7ac9a01e 100644 --- a/simd/unit_tests/include/TestSIMD_Reductions.hpp +++ b/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -172,6 +172,12 @@ TEST(simd, host_reductions) { } TEST(simd, 
device_reductions) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif Kokkos::parallel_for(1, simd_device_reduction_functor()); } From 01d5f8149da38abda68698421bb4aacd788c3d16 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 6 Feb 2024 07:40:14 -0700 Subject: [PATCH 262/432] SYCL: Error out on initialization if the backend is different from ext_oneapi_* (#6784) * SYCL: Error out on initialization if the backend is different from ext_oneapi_* * Use a macro instead. Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index f05deab54b0..0e67adb5787 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -102,6 +102,23 @@ void SYCLInternal::initialize(const sycl::device& d) { void SYCLInternal::initialize(const sycl::queue& q) { KOKKOS_EXPECTS(!is_initialized()); +#define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ + if (BACKEND != REQUIRED) \ + Kokkos::abort( \ + "The SYCL execution space instance was initialized with an " \ + "unsupported backend type! 
For this GPU architecture, only " #REQUIRED \ + " is supported.") +#if defined(KOKKOS_ARCH_INTEL_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_level_zero); +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_cuda); +#elif defined(KOKKOS_ARCH_AMD_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_hip); +#endif + if (was_finalized) Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); From 31fb4761de109335999bf97cc9da89a8c6450a09 Mon Sep 17 00:00:00 2001 From: Dan Ibanez Date: Tue, 6 Feb 2024 15:11:46 +0000 Subject: [PATCH 263/432] simd: support vector_aligned_tag (#6243) * support vector_aligned_tag this SIMD load/store tag is defined in the ISO C++ TS and specifies that the load or store operation is being done with a pointer aligned to vector width. The Intel backends (AVX2, AVX512) do have different intrinsics in this case, and this enhancement allows users of Kokkos SIMD to make use of those intrinsics. 
* Made aligned_tags into aliases of corresponding loadstore_flags * clang-formatted * Replaced loadstore_flags with simd_flags * clang-formatted --------- Co-authored-by: Dong Hun Lee --- simd/src/Kokkos_SIMD_AVX2.hpp | 124 ++++++++++++++++ simd/src/Kokkos_SIMD_AVX512.hpp | 140 ++++++++++++++++-- simd/src/Kokkos_SIMD_Common.hpp | 11 +- simd/src/Kokkos_SIMD_NEON.hpp | 78 ++++++++++ simd/src/Kokkos_SIMD_Scalar.hpp | 15 ++ .../include/SIMDTesting_Utilities.hpp | 29 +++- .../include/TestSIMD_GeneratorCtors.hpp | 8 +- simd/unit_tests/include/TestSIMD_MathOps.hpp | 24 ++- simd/unit_tests/include/TestSIMD_ShiftOps.hpp | 20 ++- .../include/TestSIMD_WhereExpressions.hpp | 8 +- 10 files changed, 419 insertions(+), 38 deletions(-) diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 14eefe5fe20..6d0956f3832 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -565,10 +565,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256d() const { return m_value; @@ -820,10 +828,18 @@ class simd> { element_aligned_tag) { m_value = _mm_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_store_ps(ptr, 
m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128() const { return m_value; @@ -1067,12 +1083,25 @@ class simd> { m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); #else m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() const { return m_value; @@ -1257,6 +1286,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( @@ -1264,6 +1302,11 @@ class simd> { _mm256_maskstore_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), 
m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1461,6 +1504,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() @@ -1613,6 +1665,11 @@ class const_where_expression>, static_cast<__m256d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)), + static_cast<__m256d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1649,6 +1706,11 @@ class where_expression>, mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_pd( + mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1692,6 +1754,11 @@ class const_where_expression>, static_cast<__m128>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)), + static_cast<__m128>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1728,6 +1795,11 @@ class where_expression>, _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); } 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type( + _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, simd> const& index) { @@ -1771,6 +1843,12 @@ class const_where_expression< _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), static_cast<__m128i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), + static_cast<__m128i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1811,6 +1889,16 @@ class where_expression>, m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m128i tmp = _mm_load_si128(reinterpret_cast<__m128i const*>(mem)); + m_value = value_type(_mm_and_si128(tmp, static_cast<__m128i>(m_mask))); +#else + m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1858,6 +1946,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::int64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1899,6 +1994,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + 
__m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1947,6 +2053,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::uint64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1988,6 +2101,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index c5d1717ad4e..7fa35c204ae 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -193,10 +193,18 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void 
copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; @@ -475,10 +483,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; @@ -735,15 +751,25 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { @@ -934,21 
+960,30 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), @@ -1130,10 +1165,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; 
@@ -1331,10 +1375,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1505,6 +1558,11 @@ class const_where_expression>, static_cast<__m512d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm512_mask_store_pd(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1541,6 +1599,11 @@ class where_expression>, _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_pd( + _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1584,6 +1647,11 @@ class const_where_expression>, static_cast<__m256>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_mask_store_ps(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1619,6 +1687,10 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_ps( _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(float const* mem, vector_aligned_tag) { + m_value 
= value_type(_mm256_mask_load_ps( + _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1666,6 +1738,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1702,6 +1780,11 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1710,6 +1793,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint32_t* mem, @@ -1784,6 +1874,12 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint32_t const* mem, @@ -1792,6 +1888,7 @@ class 
where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1866,6 +1969,12 @@ class where_expression>, m_value = value_type(_mm512_mask_loadu_epi64( _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1874,6 +1983,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1949,6 +2065,11 @@ class where_expression>, _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, simd> const& index) { @@ -1956,6 +2077,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template 
class simd_mask; -struct element_aligned_tag {}; +class simd_alignment_vector_aligned {}; + +template +struct simd_flags {}; + +inline constexpr simd_flags<> simd_flag_default{}; +inline constexpr simd_flags simd_flag_aligned{}; + +using element_aligned_tag = simd_flags<>; +using vector_aligned_tag = simd_flags; // class template declarations for const_where_expression and where_expression diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 61d506eac86..815e193d049 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -363,10 +363,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_f64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_f64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float64x2_t() const { return m_value; @@ -844,10 +852,18 @@ class simd> { element_aligned_tag) { m_value = vld1_s32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_s32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_s32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_s32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() const { return m_value; @@ -1048,10 +1064,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_s64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_s64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void 
copy_to( value_type* ptr, element_aligned_tag) const { vst1q_s64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() const { return m_value; @@ -1253,6 +1277,11 @@ class simd> { element_aligned_tag) { m_value = vld1q_u64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_u64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() const { return m_value; @@ -1396,6 +1425,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1431,6 +1465,11 @@ class where_expression>, if (m_mask[1]) m_value[1] = mem[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1552,6 +1591,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1587,6 +1632,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) 
m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1594,6 +1645,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t< @@ -1632,6 +1684,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1667,6 +1725,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1674,6 +1738,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, @@ -1754,6 +1831,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template { element_aligned_tag) { m_value = *ptr; } + KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr, vector_aligned_tag) { + m_value = *ptr; + } KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const { *ptr = m_value; 
} + KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, vector_aligned_tag) const { + *ptr = m_value; + } + KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) { return m_value; } @@ -308,6 +315,10 @@ class const_where_expression, void copy_to(T* mem, element_aligned_tag) const { if (static_cast(m_mask)) *mem = static_cast(m_value); } + KOKKOS_FORCEINLINE_FUNCTION + void copy_to(T* mem, vector_aligned_tag) const { + if (static_cast(m_mask)) *mem = static_cast(m_value); + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> scatter_to(T* mem, simd const& index) const { @@ -344,6 +355,10 @@ class where_expression, void copy_from(T const* mem, element_aligned_tag) { if (static_cast(this->m_mask)) this->m_value = *mem; } + KOKKOS_FORCEINLINE_FUNCTION + void copy_from(T const* mem, vector_aligned_tag) { + if (static_cast(this->m_mask)) this->m_value = *mem; + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> gather_from(T const* mem, simd const& index) { diff --git a/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/simd/unit_tests/include/SIMDTesting_Utilities.hpp index ae2ab2c697c..d36e1e5afc5 100644 --- a/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -93,7 +93,7 @@ class load_element_aligned { bool host_load(T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template @@ -101,7 +101,26 @@ class load_element_aligned { T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); + return true; + } +}; + +class load_vector_aligned { + public: + template + bool host_load(T const* mem, std::size_t n, + 
Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); + return true; + } + template + KOKKOS_INLINE_FUNCTION bool device_load( + T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); return true; } }; @@ -116,8 +135,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = 0; return true; } @@ -130,8 +148,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = T(0); return true; } diff --git a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4af08c266bb..23e3826c752 100644 --- a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -37,10 +37,10 @@ inline void host_check_gen_ctor() { } simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { @@ -98,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + 
rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); device_check_equality(basic, rhs, lanes); simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); @@ -106,7 +106,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); device_check_equality(result, blend, lanes); } diff --git a/simd/unit_tests/include/TestSIMD_MathOps.hpp b/simd/unit_tests/include/TestSIMD_MathOps.hpp index 6f8a8aa0f29..59f2f6c18fd 100644 --- a/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -83,6 +83,7 @@ inline void host_check_math_op_all_loaders(Op op, std::size_t n, host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); + host_check_math_op_one_loader(op, n, args...); } template @@ -121,23 +122,29 @@ inline void host_check_abi_size() { template inline void host_check_math_ops() { constexpr size_t n = 11; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); host_check_abi_size(); if constexpr (!std::is_integral_v) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; + alignas(alignment) DataType const first_args[n] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 10.0, 0.0, 1.2, -2.8}; + alignas(alignment) DataType const second_args[n] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, 13.0, -3.2, -2.1}; host_check_all_math_ops(first_args, second_args); } else { if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; 
+ alignas(alignment) + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + alignas(alignment) DataType const second_args[n] = {1, 2, 1, 1, 1, -3, + -2, 1, 13, -3, -2}; host_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + alignas(alignment) + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; host_check_all_math_ops(first_args, second_args); } } @@ -214,6 +221,7 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op, device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); + device_check_math_op_one_loader(op, n, args...); } template diff --git a/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index f6fdcb920ed..ffdd2cba4a0 100644 --- a/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -85,10 +85,11 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by, n); host_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + host_check_shift_on_one_loader(shift_op, test_vals, + shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -96,6 +97,8 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by_lanes); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + host_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template @@ -104,12 +107,14 @@ inline void 
host_check_shift_ops() { using simd_type = Kokkos::Experimental::simd; constexpr std::size_t width = simd_type::size(); constexpr std::size_t num_cases = 8; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); DataType max = std::numeric_limits::max(); - DataType shift_by[num_cases] = { + alignas(alignment) DataType shift_by[num_cases] = { 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + alignas(alignment) DataType test_vals[width]; for (std::size_t i = 0; i < width; ++i) { DataType inc = max / width; test_vals[i] = i * inc + 1; @@ -201,10 +206,11 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_by, n); device_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + device_check_shift_on_one_loader( + shift_op, test_vals, shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -212,6 +218,8 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_op, test_vals, shift_by_lanes); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + device_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template diff --git a/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 129f2b0d5c9..152fd9e9840 100644 --- a/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -29,7 +29,7 @@ inline void host_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, 
Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -46,7 +46,7 @@ inline void host_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); host_check_equality(expected_result, dst_simd, nlanes); } @@ -107,7 +107,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -124,7 +124,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); device_check_equality(expected_result, dst_simd, nlanes); } From 4d29e39ab8c1dcb89006b5ba623969c506e36081 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 6 Feb 2024 11:42:36 -0500 Subject: [PATCH 264/432] Disable test for MSVC+Cuda --- core/unit_test/TestCommonPolicyConstructors.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/unit_test/TestCommonPolicyConstructors.hpp b/core/unit_test/TestCommonPolicyConstructors.hpp index 966fb065395..f96e39b7dc8 100644 --- a/core/unit_test/TestCommonPolicyConstructors.hpp +++ b/core/unit_test/TestCommonPolicyConstructors.hpp @@ -108,6 +108,9 @@ void test_prefer_desired_occupancy(Policy policy) { test_policy_execution(policy_drop_occ); } +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) TEST(TEST_CATEGORY, 
execution_policy_occupancy_and_hint) { test_prefer_desired_occupancy(DummyPolicy<>{}); test_prefer_desired_occupancy(Kokkos::RangePolicy<>(0, 0)); @@ -115,6 +118,7 @@ TEST(TEST_CATEGORY, execution_policy_occupancy_and_hint) { test_prefer_desired_occupancy( Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); } +#endif // Check that the policy size does not increase if the user does not specify the // occupancy (only pay for what you use). From 442e4d42ad53720d3e724052c995667bc7b77787 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Tue, 6 Feb 2024 20:56:57 -0700 Subject: [PATCH 265/432] Add checks for unsafe implicit conversions in RangePolicy (#6754) * Added a check for unintended implicit conversion in RangePolicy * Changed to integrate Rangepolicy implicit checks in existing constructors instead * Changed ifdef conditions to allow outputting warning messages even when deprecated code is used Modified the unit test to test warning outputs Changed implicit conversion check to be tested in debug mode only * Fixed incorrect gtest_skip call and unused var warning * Removed ifdef kokkos_enable_debug guards * Switch to use the new interface --- core/src/Kokkos_ExecPolicy.hpp | 75 +++++++++++++++++-- .../unit_test/TestRangePolicyConstructors.hpp | 75 +++++++++++++++++++ 2 files changed, 144 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index 979db33a3e1..343af5bd690 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include #include +#include //---------------------------------------------------------------------------- @@ -114,39 +115,57 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask(0) {} /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const 
member_type work_end) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), m_begin(work_begin), m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); check_bounds_validity(); set_auto_chunk_size(); } /** \brief Total range */ - inline RangePolicy(const member_type work_begin, const member_type work_end) + template && + std::is_convertible_v), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end) : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} /** \brief Total range */ - template + template && + std::is_convertible_v), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end, + const IndexType1 work_begin, const IndexType2 work_end, Args... args) : m_space(work_space), m_begin(work_begin), m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); check_bounds_validity(); set_auto_chunk_size(); set(args...); } /** \brief Total range */ - template - inline RangePolicy(const member_type work_begin, const member_type work_end, + template && + std::is_convertible_v), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, Args... args) : RangePolicy(typename traits::execution_space(), work_begin, work_end, args...) 
{} @@ -233,6 +252,50 @@ class RangePolicy : public Impl::PolicyTraits { } } + // To be replaced with std::in_range (c++20) + template + static void check_conversion_safety(const IndexType bound) { +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ + defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_signed_v != + std::is_signed_v) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } + + // check narrowing + warn |= (static_cast(static_cast(bound)) != bound); + + if (warn) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } +#else + (void)bound; +#endif + } + public: /** \brief Subrange for a partition's rank and size. 
* diff --git a/core/unit_test/TestRangePolicyConstructors.hpp b/core/unit_test/TestRangePolicyConstructors.hpp index 88656813ff4..c8c1542af13 100644 --- a/core/unit_test/TestRangePolicyConstructors.hpp +++ b/core/unit_test/TestRangePolicyConstructors.hpp @@ -19,6 +19,7 @@ #include #include +#include namespace { @@ -121,4 +122,78 @@ TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) { #endif } +TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) { + using UIntIndexType = Kokkos::IndexType; + using IntIndexType = Kokkos::IndexType; + using UIntPolicy = Kokkos::RangePolicy; + using IntPolicy = Kokkos::RangePolicy; + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed on a bound (), which may not preserve its original value.\n"; + + auto get_error_msg = [](auto str, auto val) { + return str.insert(str.find("(") + 1, std::to_string(val).c_str()); + }; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10); }, + get_error_msg(expected, test_val)); + } + { + unsigned test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0u, test_val); }, + get_error_msg(expected, test_val)); + } + { + long long test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0LL, test_val); }, + get_error_msg(expected, test_val)); + } + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10, Kokkos::ChunkSize(2)); }, + get_error_msg(expected, test_val)); + } + +#else + { + ::testing::internal::CaptureStderr(); + int test_val = -1; + UIntPolicy policy(test_val, 10); + ASSERT_EQ(policy.begin(), 0u); + ASSERT_EQ(policy.end(), 0u); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, 
test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } + { + ::testing::internal::CaptureStderr(); + unsigned test_val = std::numeric_limits::max(); + IntPolicy policy(0u, test_val); + ASSERT_EQ(policy.begin(), 0); + ASSERT_EQ(policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } +#endif +} + } // namespace From 91cc45e3a29162a7eb5ce59a7b5d52619ee8dfde Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 6 Feb 2024 15:46:18 -0500 Subject: [PATCH 266/432] Split runtime checks from TestCommonPolicyConstructors into OccupancyControlTrait --- core/unit_test/CMakeLists.txt | 1 + .../TestCommonPolicyConstructors.hpp | 45 ++------- core/unit_test/TestOccupancyControlTrait.hpp | 91 +++++++++++++++++++ 3 files changed, 100 insertions(+), 37 deletions(-) create mode 100644 core/unit_test/TestOccupancyControlTrait.hpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index f5985b5ff89..28514d2d7c1 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -184,6 +184,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDSpan MinMaxClamp NumericTraits + OccupancyControlTrait Other ParallelScanRangePolicy Printf diff --git a/core/unit_test/TestCommonPolicyConstructors.hpp b/core/unit_test/TestCommonPolicyConstructors.hpp index f96e39b7dc8..a72cb38e5e0 100644 --- a/core/unit_test/TestCommonPolicyConstructors.hpp +++ b/core/unit_test/TestCommonPolicyConstructors.hpp @@ -45,26 +45,10 @@ static_assert(check_semiregular>()); static_assert(check_semiregular>>()); // Assert that occupancy conversion and hints work properly. 
-template -void test_policy_execution(const Kokkos::RangePolicy& policy) { - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); -} -template -void test_policy_execution(const Kokkos::TeamPolicy& policy) { - Kokkos::parallel_for( - policy, - KOKKOS_LAMBDA( - const typename Kokkos::TeamPolicy::member_type&){}); -} -template -void test_policy_execution(const Kokkos::MDRangePolicy& policy) { - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); -} -template -void test_policy_execution(const DummyPolicy&) {} - template -void test_prefer_desired_occupancy(Policy policy) { +void test_prefer_desired_occupancy() { + Policy policy; + using Kokkos::Experimental::DesiredOccupancy; using Kokkos::Experimental::MaximizeOccupancy; using Kokkos::Experimental::prefer; @@ -76,49 +60,36 @@ void test_prefer_desired_occupancy(Policy policy) { auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); static_assert( !decltype(policy_still_no_occ)::experimental_contains_desired_occupancy); - test_policy_execution(policy_still_no_occ); // MaximizeOccupancy -> DesiredOccupancy auto const policy_with_occ = prefer(policy_still_no_occ, DesiredOccupancy{33}); static_assert( decltype(policy_with_occ)::experimental_contains_desired_occupancy); - EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 33); - test_policy_execution(policy_with_occ); // DesiredOccupancy -> DesiredOccupancy auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); static_assert( decltype(policy_change_occ)::experimental_contains_desired_occupancy); - EXPECT_EQ(policy_change_occ.impl_get_desired_occupancy().value(), 24); - test_policy_execution(policy_change_occ); // DesiredOccupancy -> DesiredOccupancy w/ hint auto policy_with_occ_and_hint = Kokkos::Experimental::require( policy_change_occ, Kokkos::Experimental::WorkItemProperty::HintLightWeight); - EXPECT_EQ(policy_with_occ_and_hint.impl_get_desired_occupancy().value(), 24); - 
test_policy_execution(policy_with_occ_and_hint); // DesiredOccupancy -> MaximizeOccupancy auto const policy_drop_occ = prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); static_assert( !decltype(policy_drop_occ)::experimental_contains_desired_occupancy); - test_policy_execution(policy_drop_occ); } -// FIXME_MSVC_WITH_CUDA -// This test doesn't compile with CUDA on Windows -#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) -TEST(TEST_CATEGORY, execution_policy_occupancy_and_hint) { - test_prefer_desired_occupancy(DummyPolicy<>{}); - test_prefer_desired_occupancy(Kokkos::RangePolicy<>(0, 0)); - test_prefer_desired_occupancy(Kokkos::TeamPolicy<>{0, Kokkos::AUTO}); - test_prefer_desired_occupancy( - Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); +void test_execution_policy_occupancy_and_hint() { + test_prefer_desired_occupancy>(); + test_prefer_desired_occupancy>(); + test_prefer_desired_occupancy>(); + test_prefer_desired_occupancy>>(); } -#endif // Check that the policy size does not increase if the user does not specify the // occupancy (only pay for what you use). diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp new file mode 100644 index 00000000000..e91066d7bae --- /dev/null +++ b/core/unit_test/TestOccupancyControlTrait.hpp @@ -0,0 +1,91 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +// Dummy policy for testing base class. 
+template +struct DummyPolicy : Kokkos::Impl::PolicyTraits { + using execution_policy = DummyPolicy; +}; + +template +void test_policy_execution(const Kokkos::RangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template +void test_policy_execution(const Kokkos::TeamPolicy& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy::member_type&){}); +} +template +void test_policy_execution(const Kokkos::MDRangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} +template +void test_policy_execution(const DummyPolicy&) {} + +template +void test_prefer_desired_occupancy(Policy policy) { + using Kokkos::Experimental::DesiredOccupancy; + using Kokkos::Experimental::MaximizeOccupancy; + using Kokkos::Experimental::prefer; + using Kokkos::Experimental::WorkItemProperty; + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); + test_policy_execution(policy_still_no_occ); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = + prefer(policy_still_no_occ, DesiredOccupancy{33}); + EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 33); + test_policy_execution(policy_with_occ); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); + EXPECT_EQ(policy_change_occ.impl_get_desired_occupancy().value(), 24); + test_policy_execution(policy_change_occ); + + // DesiredOccupancy -> DesiredOccupancy w/ hint + auto policy_with_occ_and_hint = Kokkos::Experimental::require( + policy_change_occ, + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + EXPECT_EQ(policy_with_occ_and_hint.impl_get_desired_occupancy().value(), 24); + test_policy_execution(policy_with_occ_and_hint); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = + prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); + 
test_policy_execution(policy_drop_occ); +} + +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(DummyPolicy<>{}); + test_prefer_desired_occupancy(Kokkos::RangePolicy<>(0, 0)); + test_prefer_desired_occupancy(Kokkos::TeamPolicy<>{0, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); +} +#endif +} // namespace From 99b2e46ec33a23ba126149495a2e77a4a16bba05 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 08:19:25 -0500 Subject: [PATCH 267/432] Run OccupancyControlTrait on all execution spaces --- core/unit_test/TestOccupancyControlTrait.hpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp index e91066d7bae..6c6913e8a7c 100644 --- a/core/unit_test/TestOccupancyControlTrait.hpp +++ b/core/unit_test/TestOccupancyControlTrait.hpp @@ -77,15 +77,19 @@ void test_prefer_desired_occupancy(Policy policy) { test_policy_execution(policy_drop_occ); } +TEST(TEST_CATEGORY, occupancy_control) { // FIXME_MSVC_WITH_CUDA // This test doesn't compile with CUDA on Windows -#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) -TEST(TEST_CATEGORY, occupancy_control) { - test_prefer_desired_occupancy(DummyPolicy<>{}); - test_prefer_desired_occupancy(Kokkos::RangePolicy<>(0, 0)); - test_prefer_desired_occupancy(Kokkos::TeamPolicy<>{0, Kokkos::AUTO}); - test_prefer_desired_occupancy( - Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); -} +#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) + if constexpr (!std::is_same_v) #endif + { + test_prefer_desired_occupancy(DummyPolicy{}); + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 0)); + test_prefer_desired_occupancy( + Kokkos::TeamPolicy{0, Kokkos::AUTO}); + test_prefer_desired_occupancy( + 
Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); + } +} } // namespace From 19dcd64daaaa6a50f8ff4e014ae418de0ea7ad0d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 08:56:04 -0500 Subject: [PATCH 268/432] test_execution_policy_occupancy_and_hint might be unused --- core/unit_test/TestCommonPolicyConstructors.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestCommonPolicyConstructors.hpp b/core/unit_test/TestCommonPolicyConstructors.hpp index a72cb38e5e0..8ec348e47c9 100644 --- a/core/unit_test/TestCommonPolicyConstructors.hpp +++ b/core/unit_test/TestCommonPolicyConstructors.hpp @@ -84,7 +84,7 @@ void test_prefer_desired_occupancy() { !decltype(policy_drop_occ)::experimental_contains_desired_occupancy); } -void test_execution_policy_occupancy_and_hint() { +[[maybe_unused]] void test_execution_policy_occupancy_and_hint() { test_prefer_desired_occupancy>(); test_prefer_desired_occupancy>(); test_prefer_desired_occupancy>(); From 26060fed7e71f37d9713e028e051cc70c84f1800 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 12:28:58 -0500 Subject: [PATCH 269/432] Don't try to compile the test for any backend with MSVC+Cuda --- core/unit_test/TestOccupancyControlTrait.hpp | 21 +++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp index 6c6913e8a7c..ec993f3375e 100644 --- a/core/unit_test/TestOccupancyControlTrait.hpp +++ b/core/unit_test/TestOccupancyControlTrait.hpp @@ -77,19 +77,16 @@ void test_prefer_desired_occupancy(Policy policy) { test_policy_execution(policy_drop_occ); } -TEST(TEST_CATEGORY, occupancy_control) { // FIXME_MSVC_WITH_CUDA // This test doesn't compile with CUDA on Windows -#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) - if constexpr (!std::is_same_v) -#endif - { - test_prefer_desired_occupancy(DummyPolicy{}); - test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 
0)); - test_prefer_desired_occupancy( - Kokkos::TeamPolicy{0, Kokkos::AUTO}); - test_prefer_desired_occupancy( - Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); - } +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(DummyPolicy{}); + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 0)); + test_prefer_desired_occupancy( + Kokkos::TeamPolicy{0, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); } +#endif } // namespace From 1327c37795c3c3338f8c75934d57ac29ced9c6f9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 7 Feb 2024 15:30:50 -0500 Subject: [PATCH 270/432] Let Impl::get_gpu return std::optional and delegate device selection when appropriate --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 2 +- core/src/HIP/Kokkos_HIP.cpp | 2 +- core/src/OpenACC/Kokkos_OpenACC.cpp | 2 +- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp | 2 +- core/src/SYCL/Kokkos_SYCL.cpp | 2 +- core/src/impl/Kokkos_Core.cpp | 9 ++++++--- core/src/impl/Kokkos_DeviceManagement.hpp | 3 ++- 7 files changed, 13 insertions(+), 9 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 4b44d681b21..eec1e9867b4 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -532,7 +532,7 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings); + const int cuda_device_id = Impl::get_gpu(settings).value_or(0); cudaDeviceProp cudaProp; KOKKOS_IMPL_CUDA_SAFE_CALL( diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index 2a6bfea1d64..4b84ea829a8 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -42,7 +42,7 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = 
Impl::get_gpu(settings); + const int hip_device_id = Impl::get_gpu(settings).value_or(0); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( diff --git a/core/src/OpenACC/Kokkos_OpenACC.cpp b/core/src/OpenACC/Kokkos_OpenACC.cpp index f54c44d66f0..715159d1771 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -59,7 +59,7 @@ void Kokkos::Experimental::OpenACC::impl_initialize( acc_get_device_num(acc_device_host); } else { using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings); + int const dev_num = get_gpu(settings).value_or(0); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; } diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 9e8844a6f20..dd831185849 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -179,7 +179,7 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings); + const int device_num = get_gpu(settings).value_or(0); omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index af64b6908d4..01da97c6393 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -154,7 +154,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { return; } #endif - const auto id = ::Kokkos::Impl::get_gpu(settings); + const auto id = ::Kokkos::Impl::get_gpu(settings).value_or(0); Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); Impl::SYCLInternal::m_syclDev = id; } diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 
0229da88923..e51782f836b 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -16,6 +16,7 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include #endif #include @@ -377,7 +378,8 @@ std::vector Kokkos::Impl::get_visible_devices(int device_count) { return visible_devices; } -int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { +std::optional Kokkos::Impl::get_gpu( + const InitializationSettings& settings) { std::vector visible_devices = get_visible_devices(get_device_count()); int const num_devices = visible_devices.size(); // device_id is provided @@ -425,14 +427,15 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { int const mpi_local_rank = mpi_local_rank_on_node(); - // use first GPU available for execution if unable to detect local MPI rank + // if unable to detect local MPI rank return nullopt to delegate device + // selection to the backend if (mpi_local_rank < 0) { if (settings.has_map_device_id_by()) { std::cerr << "Warning: unable to detect local MPI rank." << " Falling back to the first GPU available for execution." << " Raised by Kokkos::initialize()." 
<< std::endl; } - return visible_devices[0]; + return std::nullopt; } // use device assigned by CTest when resource allocation is activated diff --git a/core/src/impl/Kokkos_DeviceManagement.hpp b/core/src/impl/Kokkos_DeviceManagement.hpp index 5783cb3d79e..70dca5d8fad 100644 --- a/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/core/src/impl/Kokkos_DeviceManagement.hpp @@ -17,12 +17,13 @@ #ifndef KOKKOS_DEVICE_MANAGEMENT_HPP #define KOKKOS_DEVICE_MANAGEMENT_HPP +#include #include namespace Kokkos { class InitializationSettings; namespace Impl { -int get_gpu(const Kokkos::InitializationSettings& settings); +std::optional get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); std::vector get_visible_devices(int device_count); // test-only From 3db377e15019fa837f90f74afc9c0e83c4cb161b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 7 Feb 2024 18:00:42 -0500 Subject: [PATCH 271/432] Fixup select from visible devices --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 4 +++- core/src/HIP/Kokkos_HIP.cpp | 4 +++- core/src/OpenACC/Kokkos_OpenACC.cpp | 4 +++- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp | 4 +++- core/src/SYCL/Kokkos_SYCL.cpp | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index eec1e9867b4..f439f4fd0b7 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -532,7 +532,9 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings).value_or(0); + const std::vector &visible_devices = Impl::get_visible_devices(); + const int cuda_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); cudaDeviceProp cudaProp; KOKKOS_IMPL_CUDA_SAFE_CALL( diff --git a/core/src/HIP/Kokkos_HIP.cpp 
b/core/src/HIP/Kokkos_HIP.cpp index 4b84ea829a8..309e07fb3fb 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -42,7 +42,9 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = Impl::get_gpu(settings).value_or(0); + const std::vector& visible_devices = Impl::get_visible_devices(); + const int hip_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( diff --git a/core/src/OpenACC/Kokkos_OpenACC.cpp b/core/src/OpenACC/Kokkos_OpenACC.cpp index 715159d1771..99daf379b6f 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -58,8 +58,10 @@ void Kokkos::Experimental::OpenACC::impl_initialize( Impl::OpenACCInternal::m_acc_device_num = acc_get_device_num(acc_device_host); } else { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings).value_or(0); + int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; } diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index dd831185849..3387108da39 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -178,8 +178,10 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { } void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings).value_or(0); + const int device_num = get_gpu(settings).value_or(visible_devices[0]); 
omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 01da97c6393..580a1d4a693 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -154,7 +154,9 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { return; } #endif - const auto id = ::Kokkos::Impl::get_gpu(settings).value_or(0); + const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); + const auto id = + ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); Impl::SYCLInternal::m_syclDev = id; } From 391f2d12f2d3e5f896d497a11d0752aa47be7277 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 16:03:45 -0700 Subject: [PATCH 272/432] Fix SharedAllocationRecord to allocate using the correct execution space instance (#6789) * Fix SharedAllocationRecord * Fix SYCL --- core/src/Cuda/Kokkos_CudaSpace.hpp | 20 +++++++++++++ core/src/HIP/Kokkos_HIP_Space.hpp | 29 +++++++++++++++++++ core/src/Kokkos_HostSpace.hpp | 10 +++++++ .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 10 +++++++ core/src/SYCL/Kokkos_SYCL_Space.hpp | 20 +++++++++++++ core/src/impl/Kokkos_SharedAlloc.hpp | 8 ++--- 6 files changed, 93 insertions(+), 4 deletions(-) diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index b0e36f1a875..0e20193e8b4 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -168,6 +168,16 @@ class CudaUVMSpace { ~CudaUVMSpace() = default; /**\brief Allocate untracked memory in the cuda space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return 
allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -250,6 +260,16 @@ class CudaHostPinnedSpace { ~CudaHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index 28e5a1ccd50..7f2004e5cbc 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,15 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -125,6 +134,16 @@ class HIPHostPinnedSpace { ~HIPHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + 
void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -188,6 +207,16 @@ class HIPManagedSpace { ~HIPManagedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 82adb29f2fc..a1fb0f5a677 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -93,6 +93,16 @@ class HostSpace { #endif /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index 98ff7b18d0e..ed625cfcc82 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ 
b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -98,6 +98,16 @@ class OpenMPTargetSpace { ~OpenMPTargetSpace() = default; /**\brief Allocate untracked memory in the space */ + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index f7b801f8463..b86cfca413c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -82,6 +82,16 @@ class SYCLSharedUSMSpace { SYCLSharedUSMSpace(); explicit SYCLSharedUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -113,6 +123,16 @@ class SYCLHostUSMSpace { SYCLHostUSMSpace(); explicit SYCLHostUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, 
arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, diff --git a/core/src/impl/Kokkos_SharedAlloc.hpp b/core/src/impl/Kokkos_SharedAlloc.hpp index 9a9c2653de3..99ab660213f 100644 --- a/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/core/src/impl/Kokkos_SharedAlloc.hpp @@ -252,14 +252,14 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { ~SharedAllocationRecordCommon(); template SharedAllocationRecordCommon( - ExecutionSpace const&, MemorySpace const& space, std::string const& label, - std::size_t alloc_size, + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, record_base_t::function_type dealloc = &deallocate) : SharedAllocationRecord( #ifdef KOKKOS_ENABLE_DEBUG &s_root_record, #endif - checked_allocation_with_header(space, label, alloc_size), + checked_allocation_with_header(exec, space, label, alloc_size), sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), m_space(space) { auto& header = *SharedAllocationRecord::m_alloc_ptr; @@ -315,7 +315,7 @@ class HostInaccessibleSharedAllocationRecordCommon #ifdef KOKKOS_ENABLE_DEBUG &s_root_record, #endif - checked_allocation_with_header(space, label, alloc_size), + checked_allocation_with_header(exec, space, label, alloc_size), sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), m_space(space) { SharedAllocationHeader header; From 0ed2ebfee6995ca7a481601b610c44ce479832c8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 18:24:37 -0500 Subject: [PATCH 273/432] Make ranges non-trivial --- core/unit_test/TestOccupancyControlTrait.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp index ec993f3375e..7ec7a3d0f1e 100644 --- a/core/unit_test/TestOccupancyControlTrait.hpp +++ 
b/core/unit_test/TestOccupancyControlTrait.hpp @@ -82,11 +82,11 @@ void test_prefer_desired_occupancy(Policy policy) { #if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) TEST(TEST_CATEGORY, occupancy_control) { test_prefer_desired_occupancy(DummyPolicy{}); - test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 0)); + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 1)); test_prefer_desired_occupancy( - Kokkos::TeamPolicy{0, Kokkos::AUTO}); + Kokkos::TeamPolicy{1, Kokkos::AUTO}); test_prefer_desired_occupancy( - Kokkos::MDRangePolicy>{{0, 0}, {0, 0}}); + Kokkos::MDRangePolicy>{{0, 0}, {1, 1}}); } #endif } // namespace From f07a537c4ac32d3f7c6494dbf935a69ec4f71b4e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 7 Feb 2024 18:36:34 -0500 Subject: [PATCH 274/432] Drop Experimental::HBWSpace --- Makefile.kokkos | 10 - Makefile.targets | 3 - cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_tpls.cmake | 7 - core/src/CMakeLists.txt | 4 - core/src/HPX/Kokkos_HPX.hpp | 9 +- core/src/Kokkos_HBWSpace.hpp | 235 ---------------------- core/src/OpenMP/Kokkos_OpenMP.hpp | 16 +- core/src/decl/Kokkos_Declare_HBWSpace.hpp | 24 --- core/src/fwd/Kokkos_Fwd_HBWSpace.hpp | 29 --- core/src/impl/Kokkos_Core.cpp | 12 -- core/src/impl/Kokkos_HBWSpace.cpp | 184 ----------------- 12 files changed, 4 insertions(+), 530 deletions(-) delete mode 100644 core/src/Kokkos_HBWSpace.hpp delete mode 100644 core/src/decl/Kokkos_Declare_HBWSpace.hpp delete mode 100644 core/src/fwd/Kokkos_Fwd_HBWSpace.hpp delete mode 100644 core/src/impl/Kokkos_HBWSpace.cpp diff --git a/Makefile.kokkos b/Makefile.kokkos index 5598f19da2f..e602a37aef4 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -632,7 +632,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_LIBS += -lmemkind -lnuma KOKKOS_TPL_LIBRARY_NAMES += memkind numa endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") endif ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) @@ -1287,10 +1286,6 @@ ifneq 
($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) @@ -1401,11 +1396,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. 
Not sure if that has any bad side effects diff --git a/Makefile.targets b/Makefile.targets index 6db0f2c17cc..e6900a822a8 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -121,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 76549a31195..655f4fec9c4 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -58,7 +58,6 @@ /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC #cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_ENABLE_ONEDPL diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index f8cb90d6dcf..5c59a8e95d5 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -33,9 +33,6 @@ ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) KOKKOS_TPL_OPTION(MEMKIND Off) -IF(KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE ON) -ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT @@ -117,7 +114,3 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) -IF (KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE) - LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) -ENDIF() 
diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 1943f7e5485..4d3eb3c1619 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -86,10 +86,6 @@ IF (KOKKOS_ENABLE_HPX) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) ENDIF() -IF (NOT KOKKOS_ENABLE_MEMKIND) - LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) -ENDIF() - IF (KOKKOS_ENABLE_SERIAL) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index e444f746e7a..26181a7c05d 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -27,14 +27,6 @@ static_assert(false, #include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -59,6 +51,7 @@ static_assert(false, #include +#include #include #include #include diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp deleted file mode 100644 index 4400bb77606..00000000000 --- a/core/src/Kokkos_HBWSpace.hpp +++ /dev/null @@ -1,235 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_HBWSPACE_HPP -#define KOKKOS_HBWSPACE_HPP - -#include -#ifdef KOKKOS_ENABLE_HBWSPACE - -#include -#include - -namespace Kokkos { - -namespace Experimental { - -/// \class HBWSpace -/// \brief Memory management for host memory. -/// -/// HBWSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class HBWSpace { - public: - //! Tag this class as a kokkos memory space - using memory_space = HBWSpace; - using size_type = size_t; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::DefaultHostExecutionSpace; - - //! 
This memory space preferred device_type - using device_type = Kokkos::Device; - - /**\brief Default memory space instance */ - HBWSpace(); - HBWSpace(const HBWSpace& rhs) = default; - HBWSpace& operator=(const HBWSpace&) = default; - ~HBWSpace() = default; - - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - - enum AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - explicit HBWSpace(const AllocationMechanism&); - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - - public: - /**\brief Return Name of the MemorySpace */ - static constexpr const char* name() { return "HBW"; } - - private: - AllocationMechanism m_alloc_mech; -}; - -} // namespace Experimental - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::Experimental::HBWSpace); - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - 
-static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::HBWSpace, - Kokkos::Experimental::HBWSpace>::assignable); - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = true }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(copy_space, dst, src, n); - } -}; - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) 
{ - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif -#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 404076d0111..11292af84ad 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -27,14 +27,7 @@ static_assert(false, #include -#include -#include #include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -45,6 +38,8 @@ static_assert(false, #include +#include +#include #include /*--------------------------------------------------------------------------*/ @@ -62,12 +57,7 @@ class OpenMP { //! Tag this class as a kokkos execution space using execution_space = OpenMP; - using memory_space = -#ifdef KOKKOS_ENABLE_HBWSPACE - Experimental::HBWSpace; -#else - HostSpace; -#endif + using memory_space = HostSpace; //! This execution space preferred device_type using device_type = Kokkos::Device; diff --git a/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/core/src/decl/Kokkos_Declare_HBWSpace.hpp deleted file mode 100644 index 1328c931352..00000000000 --- a/core/src/decl/Kokkos_Declare_HBWSpace.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_DECLARE_HBWSPACE_HPP -#define KOKKOS_DECLARE_HBWSPACE_HPP - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#endif diff --git a/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp deleted file mode 100644 index 21ba7fad01c..00000000000 --- a/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp +++ /dev/null @@ -1,29 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_HBWSPACE_FWD_HPP_ -#define KOKKOS_HBWSPACE_FWD_HPP_ - -#ifdef KOKKOS_ENABLE_HBWSPACE -namespace Kokkos { - -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL - /// processor) -} // namespace Experimental -} // namespace Kokkos -#endif -#endif diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 0229da88923..11f09eccc5f 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -447,13 +447,6 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { namespace { void initialize_backends(const Kokkos::InitializationSettings& settings) { -// This is an experimental setting -// For KNL in Flat mode this variable should be set, so that -// memkind allocates high bandwidth memory correctly. 
-#ifdef KOKKOS_ENABLE_HBWSPACE - setenv("MEMKIND_HBW_NODES", "1", 0); -#endif - Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings); } @@ -573,11 +566,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_HBWSPACE - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); -#endif #ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", "yes"); diff --git a/core/src/impl/Kokkos_HBWSpace.cpp b/core/src/impl/Kokkos_HBWSpace.cpp deleted file mode 100644 index 947bef1253b..00000000000 --- a/core/src/impl/Kokkos_HBWSpace.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#ifdef KOKKOS_ENABLE_HBWSPACE -#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { - -/* Default allocation mechanism */ -HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); -} - -/* Default allocation mechanism */ -HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) - : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init2\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); - if (arg_alloc_mech == STD_MALLOC) { - m_alloc_mech = HBWSpace::STD_MALLOC; - } -} - -void *HBWSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} -void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} -void *HBWSpace::impl_allocate( - const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void *) == sizeof(uintptr_t), - "Error sizeof(void*) != sizeof(uintptr_t)"); - - static_assert( - Kokkos::Impl::power_of_two::value, - "Memory alignment must be power of two"); - - constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - constexpr uintptr_t alignment_mask = alignment - 1; 
- - void *ptr = nullptr; - - if (arg_alloc_size) { - if (m_alloc_mech == STD_MALLOC) { - // Over-allocate to and round up to guarantee proper alignment. - size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; - - void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast(alloc_ptr); - - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? (alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast(address) = alloc_ptr; - } - } - } - - if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || - (reinterpret_cast(ptr) & alignment_mask)) { - std::ostringstream msg; - msg << "Kokkos::Experimental::HBWSpace::allocate[ "; - switch (m_alloc_mech) { - case STD_MALLOC: msg << "STD_MALLOC"; break; - case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; - case POSIX_MMAP: msg << "POSIX_MMAP"; break; - case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; - } - msg << " ]( " << arg_alloc_size << " ) FAILED"; - if (ptr == nullptr) { - msg << " nullptr"; - } else { - msg << " NOT ALIGNED " << ptr; - } - - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void HBWSpace::deallocate(void *const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} -void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} -void HBWSpace::impl_deallocate( - const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (arg_alloc_ptr) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - - if (m_alloc_mech == STD_MALLOC) { - void *alloc_ptr = *(reinterpret_cast(arg_alloc_ptr) - 1); - memkind_free(MEMKIND_TYPE, alloc_ptr); - } - } -} - -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::HBWSpace); - -#endif From e28b5797628ea7ca6d0c2a7e13d7d100acf73a4a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 7 Feb 2024 18:38:31 -0500 Subject: [PATCH 275/432] Fixup bogous shared alloc fence labels mentioning HBWSpace --- core/src/impl/Kokkos_SharedAlloc_timpl.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index c903180dea5..41036ab0678 100644 --- a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ 
b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -136,9 +136,8 @@ void* SharedAllocationRecordCommon::reallocate_tracked( Kokkos::Impl::DeepCopy( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); @@ -193,9 +192,8 @@ void* HostInaccessibleSharedAllocationRecordCommon< Kokkos::Impl::DeepCopy( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); From 136360bb31a72dda965b86608cfbbae94790f49b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 18:41:06 -0500 Subject: [PATCH 276/432] Restore TestCommonPolicyConstructors.hpp --- core/unit_test/TestCommonPolicyConstructors.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestCommonPolicyConstructors.hpp b/core/unit_test/TestCommonPolicyConstructors.hpp index 8ec348e47c9..ec5d1ae0f46 100644 --- a/core/unit_test/TestCommonPolicyConstructors.hpp +++ b/core/unit_test/TestCommonPolicyConstructors.hpp @@ -66,16 +66,19 @@ void test_prefer_desired_occupancy() { prefer(policy_still_no_occ, DesiredOccupancy{33}); static_assert( decltype(policy_with_occ)::experimental_contains_desired_occupancy); + EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 33); // DesiredOccupancy -> DesiredOccupancy auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); static_assert( 
decltype(policy_change_occ)::experimental_contains_desired_occupancy); + EXPECT_EQ(policy_change_occ.impl_get_desired_occupancy().value(), 24); // DesiredOccupancy -> DesiredOccupancy w/ hint auto policy_with_occ_and_hint = Kokkos::Experimental::require( policy_change_occ, Kokkos::Experimental::WorkItemProperty::HintLightWeight); + EXPECT_EQ(policy_with_occ_and_hint.impl_get_desired_occupancy().value(), 24); // DesiredOccupancy -> MaximizeOccupancy auto const policy_drop_occ = @@ -84,7 +87,7 @@ void test_prefer_desired_occupancy() { !decltype(policy_drop_occ)::experimental_contains_desired_occupancy); } -[[maybe_unused]] void test_execution_policy_occupancy_and_hint() { +TEST(TEST_CATEGORY, execution_policy_occupancy_and_hint) { test_prefer_desired_occupancy>(); test_prefer_desired_occupancy>(); test_prefer_desired_occupancy>(); From 473cd53136d2f116626cb64face2383eb21f1913 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 7 Feb 2024 18:42:02 -0500 Subject: [PATCH 277/432] Remove DummyPolicy --- core/unit_test/TestOccupancyControlTrait.hpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp index 7ec7a3d0f1e..345a906d668 100644 --- a/core/unit_test/TestOccupancyControlTrait.hpp +++ b/core/unit_test/TestOccupancyControlTrait.hpp @@ -18,12 +18,6 @@ namespace { -// Dummy policy for testing base class. 
-template -struct DummyPolicy : Kokkos::Impl::PolicyTraits { - using execution_policy = DummyPolicy; -}; - template void test_policy_execution(const Kokkos::RangePolicy& policy) { Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); @@ -39,8 +33,6 @@ template void test_policy_execution(const Kokkos::MDRangePolicy& policy) { Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); } -template -void test_policy_execution(const DummyPolicy&) {} template void test_prefer_desired_occupancy(Policy policy) { @@ -56,19 +48,16 @@ void test_prefer_desired_occupancy(Policy policy) { // MaximizeOccupancy -> DesiredOccupancy auto const policy_with_occ = prefer(policy_still_no_occ, DesiredOccupancy{33}); - EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 33); test_policy_execution(policy_with_occ); // DesiredOccupancy -> DesiredOccupancy auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); - EXPECT_EQ(policy_change_occ.impl_get_desired_occupancy().value(), 24); test_policy_execution(policy_change_occ); // DesiredOccupancy -> DesiredOccupancy w/ hint auto policy_with_occ_and_hint = Kokkos::Experimental::require( policy_change_occ, Kokkos::Experimental::WorkItemProperty::HintLightWeight); - EXPECT_EQ(policy_with_occ_and_hint.impl_get_desired_occupancy().value(), 24); test_policy_execution(policy_with_occ_and_hint); // DesiredOccupancy -> MaximizeOccupancy @@ -81,7 +70,6 @@ void test_prefer_desired_occupancy(Policy policy) { // This test doesn't compile with CUDA on Windows #if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) TEST(TEST_CATEGORY, occupancy_control) { - test_prefer_desired_occupancy(DummyPolicy{}); test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 1)); test_prefer_desired_occupancy( Kokkos::TeamPolicy{1, Kokkos::AUTO}); From 95f70b3f4c843f0816fda8e824663667ebe3f460 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 7 Feb 2024 18:44:36 -0500 Subject: [PATCH 278/432] Remove support for memkind --- Makefile.kokkos | 17 
+---------------- cmake/KokkosCore_config.h.in | 1 - cmake/Modules/FindTPLMEMKIND.cmake | 1 - cmake/kokkos_tpls.cmake | 2 -- core/src/CMakeLists.txt | 1 - generate_makefile.bash | 18 ++---------------- gnu_generate_makefile.bash | 9 --------- scripts/testing_scripts/generate_makefile.bash | 9 --------- 8 files changed, 3 insertions(+), 55 deletions(-) delete mode 100644 cmake/Modules/FindTPLMEMKIND.cmake diff --git a/Makefile.kokkos b/Makefile.kokkos index e602a37aef4..374ce95cd7e 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -19,7 +19,7 @@ KOKKOS_DEVICES ?= "Threads" KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc,librt KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -69,7 +69,6 @@ KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. 
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -620,20 +619,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) KOKKOS_TPL_LIBRARY_NAMES += rt endif -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 655f4fec9c4..34b58fffa86 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -23,7 +23,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_MEMKIND #cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL #cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED diff --git a/cmake/Modules/FindTPLMEMKIND.cmake b/cmake/Modules/FindTPLMEMKIND.cmake deleted file mode 100644 index 20aaff22955..00000000000 --- a/cmake/Modules/FindTPLMEMKIND.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index 5c59a8e95d5..f763f9b9054 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -32,7 +32,6 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(MEMKIND Off) KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT @@ -78,7 +77,6 @@ 
KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) -KOKKOS_IMPORT_TPL(MEMKIND) IF (NOT WIN32) KOKKOS_IMPORT_TPL(THREADS INTERFACE) ENDIF() diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 4d3eb3c1619..ebfa3c899b7 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -182,7 +182,6 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) ENDIF() KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) diff --git a/generate_makefile.bash b/generate_makefile.bash index 301a1fceb5a..47c25a2850b 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -226,7 +226,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -342,10 +341,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -452,15 +447,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - if [ ! 
-e ${KOKKOS_PATH}/CMakeLists.txt ]; then if [ "${KOKKOS_PATH}" == "" ]; then CM_SCRIPT=$0 @@ -506,5 +492,5 @@ if [[ ${COMPILER} == *clang* ]]; then fi fi -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} 
${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} diff --git a/gnu_generate_makefile.bash b/gnu_generate_makefile.bash index 5ea159cdd47..8cf28eef7e5 100755 --- a/gnu_generate_makefile.bash +++ b/gnu_generate_makefile.bash @@ -74,9 +74,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -198,7 +195,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." 
echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -298,11 +294,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/scripts/testing_scripts/generate_makefile.bash b/scripts/testing_scripts/generate_makefile.bash index ae1db3186f7..44541815f26 100755 --- a/scripts/testing_scripts/generate_makefile.bash +++ b/scripts/testing_scripts/generate_makefile.bash @@ -59,9 +59,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -177,7 +174,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." 
echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -269,11 +265,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi From 1502379d035e5e5d3b9c9d9d644cf9d513a3c119 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Wed, 7 Feb 2024 13:58:04 -0700 Subject: [PATCH 279/432] Added missing copy_from() in neon for vector_aligned --- simd/src/Kokkos_SIMD_NEON.hpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 815e193d049..efc81135d16 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -615,10 +615,18 @@ class simd> { element_aligned_tag) { m_value = vld1_f32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_f32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_f32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_f32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float32x2_t() const { return m_value; @@ -1281,7 +1289,14 @@ class simd> { vector_aligned_tag) { m_value = vld1q_u64(ptr); } - + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_u64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_u64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
constexpr explicit operator uint64x2_t() const { return m_value; @@ -1513,6 +1528,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1547,6 +1567,10 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, From 3496c6fde32d0f8024a6d8ed74a2235945fea00e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 8 Feb 2024 09:48:05 -0500 Subject: [PATCH 280/432] Remove stray include header --- core/src/impl/Kokkos_Core.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index e51782f836b..2866746d07f 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -16,7 +16,6 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE -#include #endif #include From 3e405209dc52b6629e3c2e2ae9af10d5a435fa7b Mon Sep 17 00:00:00 2001 From: Patrick Diehl Date: Thu, 8 Feb 2024 10:28:08 -0600 Subject: [PATCH 281/432] Add support for RISCV and the Milk-V's Pioneer (#6773) Add RISC-V Milk-V's Pioneer to the arch list --------- Co-authored-by: Christian Trott Co-authored-by: Damien L-G Co-authored-by: Damien L-G Co-authored-by: Daniel Arndt --- cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_arch.cmake | 18 ++++++++++++++++++ core/src/impl/Kokkos_Core.cpp | 7 +++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 34b58fffa86..2a6364d6399 100644 --- 
a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -76,6 +76,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_RISCV_SG2042 #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 575c9b70b1c..a7c6357b108 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -67,6 +67,7 @@ DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") +DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) SET(KOKKOS_SHOW_CUDA_ARCHS ON) @@ -375,6 +376,23 @@ IF (KOKKOS_ARCH_HSW) ) ENDIF() +IF (KOKKOS_ARCH_RISCV_SG2042) + IF(NOT + (KOKKOS_CXX_COMPILER_ID STREQUAL GNU + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR + (KOKKOS_CXX_COMPILER_ID STREQUAL Clang + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) + ) + MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + ENDIF() + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + DEFAULT -march=rv64imafdcv + ) +ENDIF() + + IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 11f09eccc5f..05472ecd244 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -678,6 +678,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_ZEN3) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN3"); +#elif defined(KOKKOS_ARCH_RISCV_SG2042) + declare_configuration_metadata("architecture", "CPU 
architecture", + "SG2042 (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -756,8 +759,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); #elif defined(KOKKOS_ARCH_AMD_GFX90A) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX90A"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX90A"); #elif defined(KOKKOS_ARCH_AMD_GFX1030) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX1030"); From 7ff87a5b25af620c71ac13275f4875845334687c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 8 Feb 2024 12:29:54 -0700 Subject: [PATCH 282/432] SYCL: Filter GPU devices (#6758) * SYCL: Filter GPU devices * Error out if no GPU was found * Move definition of get_sycl_devices() to Kokkos_SYCL.cpp * Don't error out when no GPUs are available --- core/src/SYCL/Kokkos_SYCL.cpp | 27 ++++++++++++++++++++++++--- core/src/SYCL/Kokkos_SYCL.hpp | 4 ++++ core/src/impl/Kokkos_Core.cpp | 2 +- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 580a1d4a693..a4c844fbed4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -142,8 +142,8 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); + std::vector gpu_devices = Impl::get_sycl_devices(); + // If the device id is not specified and there are no GPUs, sidestep Kokkos // device selection and use whatever is available (if no GPU architecture is // specified). 
@@ -250,9 +250,30 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { +std::vector get_sycl_devices() { + std::vector gpu_devices = + sycl::device::get_devices(sycl::info::device_type::gpu); +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ + defined(KOKKOS_ARCH_AMD_GPU) +#if defined(KOKKOS_ARCH_INTEL_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_level_zero; +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_cuda; +#elif defined(KOKKOS_ARCH_AMD_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_hip; +#endif + gpu_devices.erase(std::remove_if(gpu_devices.begin(), gpu_devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + gpu_devices.end()); +#endif + return gpu_devices; +} + int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); -} +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/core/src/SYCL/Kokkos_SYCL.hpp b/core/src/SYCL/Kokkos_SYCL.hpp index 47756b039bd..0f3d1f0994d 100644 --- a/core/src/SYCL/Kokkos_SYCL.hpp +++ b/core/src/SYCL/Kokkos_SYCL.hpp @@ -184,6 +184,10 @@ std::vector partition_space(const SYCL& sycl_space, sycl::queue(context, device, sycl::property::queue::in_order())); return instances; } + +namespace Impl { +std::vector get_sycl_devices(); +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index a872efef50d..efaf400c9ad 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -137,7 +137,7 @@ int get_device_count() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); return count; #elif defined(KOKKOS_ENABLE_SYCL) - return sycl::device::get_devices(sycl::info::device_type::gpu).size(); + return Kokkos::Experimental::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return 
acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); From 21b1105421f705cf0de38479b4647392483c70ff Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 8 Feb 2024 18:35:53 -0500 Subject: [PATCH 283/432] Drop KOKKOS_ENABLE_INTEL_MM_ALLOC macro --- core/src/impl/Kokkos_Core.cpp | 8 -------- core/src/impl/Kokkos_HostSpace.cpp | 19 ++----------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index efaf400c9ad..ec6b66ea4f4 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -568,14 +568,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "no"); -#endif - #ifdef KOKKOS_ENABLE_ASM declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); #else diff --git a/core/src/impl/Kokkos_HostSpace.cpp b/core/src/impl/Kokkos_HostSpace.cpp index 6064a595f9c..1047b773d77 100644 --- a/core/src/impl/Kokkos_HostSpace.cpp +++ b/core/src/impl/Kokkos_HostSpace.cpp @@ -20,22 +20,11 @@ #include +#include +#include #include #include -/*--------------------------------------------------------------------------*/ - -#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ - !defined(KOKKOS_ENABLE_CUDA) - -// Intel specialized allocator does not interoperate with CUDA memory allocation - -#define KOKKOS_ENABLE_INTEL_MM_ALLOC - -#endif - -/*--------------------------------------------------------------------------*/ - #include #include #include @@ -49,10 +38,6 @@ #include #endif -#include -#include -#include - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- From 
aced864ecc123ca9423222f99ddf411f34bcddc0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 8 Feb 2024 22:45:15 -0500 Subject: [PATCH 284/432] Drop librt TPL and associated KOKKOS_ENABLE_LIBRT macro --- Makefile.kokkos | 11 ++--------- cmake/KokkosCore_config.h.in | 2 -- cmake/Modules/FindTPLLIBRT.cmake | 1 - cmake/kokkos_tpls.cmake | 2 -- core/src/CMakeLists.txt | 1 - core/src/impl/Kokkos_Core.cpp | 5 ----- 6 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 cmake/Modules/FindTPLLIBRT.cmake diff --git a/Makefile.kokkos b/Makefile.kokkos index 374ce95cd7e..e8d4cf752c1 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -19,7 +19,7 @@ KOKKOS_DEVICES ?= "Threads" KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt +# Options: hwloc KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -46,7 +46,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$( uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -68,7 +68,6 @@ KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) # Check for advanced settings. 
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -613,12 +612,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 2a6364d6399..37f62980c61 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -23,7 +23,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL #cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED @@ -56,7 +55,6 @@ /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_USE_LIBRT #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_ENABLE_ONEDPL diff --git a/cmake/Modules/FindTPLLIBRT.cmake b/cmake/Modules/FindTPLLIBRT.cmake deleted file mode 100644 index e75da56b5b5..00000000000 --- a/cmake/Modules/FindTPLLIBRT.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index f763f9b9054..f80d724f7f4 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -33,7 +33,6 @@ ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_HAS_TRILINOS) SET(ROCM_DEFAULT ON) @@ -75,7 +74,6 @@ KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) KOKKOS_IMPORT_TPL(HPX INTERFACE) 
KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) IF (NOT WIN32) KOKKOS_IMPORT_TPL(THREADS INTERFACE) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index ebfa3c899b7..7b7c31a6baa 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -185,7 +185,6 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread IF (NOT WIN32) KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index efaf400c9ad..541fda1f91a 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -613,11 +613,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); #endif -#ifdef KOKKOS_ENABLE_LIBRT - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); -#else - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); -#endif #ifdef KOKKOS_ENABLE_LIBDL declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes"); #else From 3b515c99e3efc5d643199f06ca569afe183cf032 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 9 Feb 2024 07:02:33 -0700 Subject: [PATCH 285/432] Cuda multi-GPU support: Pass the correct device id to get_cuda_kernel_func_attributes (#6767) * Pass the correct device id to get_cuda_kernel_func_attributes * Address review comments * Fix another occurence * Fix configure_shmem_preference --- core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 40 +++++++++---------- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 9 ++--- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 9 ++--- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 15 ++++--- 4 files changed, 35 
insertions(+), 38 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index fbdfc149011..b0dadb45f72 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -128,32 +128,32 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { // These functions need to be templated on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. -// FIXME_CUDA_MULTIPLE_DEVICES template const cudaFuncAttributes& get_cuda_kernel_func_attributes( - const KernelFuncPtr& func) { + int cuda_device, const KernelFuncPtr& func) { // Only call cudaFuncGetAttributes once for each unique kernel // by leveraging static variable initialization rules - auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + static std::map func_attr; + if (func_attr.find(cuda_device) == func_attr.end()) { cudaFuncAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr, - func))); - return attr; - }; - static cudaFuncAttributes func_attr = wrap_get_attributes(); - return func_attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + func_attr.emplace(cuda_device, attr); + } + return func_attr[cuda_device]; } template -inline void configure_shmem_preference(const KernelFuncPtr& func, +inline void configure_shmem_preference(const int cuda_device, + const KernelFuncPtr& func, const cudaDeviceProp& device_props, const size_t block_size, int& shmem, const size_t occupancy) { #ifndef KOKKOS_ARCH_KEPLER const auto& func_attr = - get_cuda_kernel_func_attributes(func); + get_cuda_kernel_func_attributes(cuda_device, + func); // Compute limits for number of blocks due to registers/SM const size_t regs_per_sm = device_props.regsPerMultiprocessor; @@ -387,8 
+387,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } void const* args[] = {&driver}; @@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -668,8 +668,8 @@ struct CudaParallelLaunchImpl< Impl::configure_shmem_preference< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } desul::ensure_cuda_lock_arrays_on_device(); @@ -685,10 +685,10 @@ struct CudaParallelLaunchImpl< } } - static cudaFuncAttributes get_cuda_func_attributes() { + static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) { return get_cuda_kernel_func_attributes< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func()); + cuda_device, base_t::get_kernel_func()); } }; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index fa804556d6e..63038984004 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -41,8 +41,8 @@ namespace Impl { template int 
max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + pol.space().cuda_device()); auto const& prop = pol.space().cuda_device_prop(); // Limits due to registers/SM, MDRange doesn't have @@ -332,9 +332,8 @@ class ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( (n && (maxShmemPerBlock < shmem_size)) || (n > diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 84889642ab2..0f052be3c30 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -85,8 +85,8 @@ class ParallelFor, Kokkos::Cuda> { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + m_policy.space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, @@ -267,9 +267,8 @@ class ParallelReduce, using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( (n && (maxShmemPerBlock < shmem_size)) || (n > diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 63b64bf9876..9f7be45c839 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -98,7 +98,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = 
CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); int block_size = Kokkos::Impl::cuda_get_max_block_size( @@ -137,7 +137,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( @@ -370,7 +370,7 @@ class TeamPolicyInternal cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, (size_t)impl_vector_length(), @@ -540,8 +540,8 @@ class ParallelFor, auto internal_space_instance = m_policy.space().impl_internal_space_instance(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -909,9 +909,8 @@ class ParallelReduce::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? 
m_team_size From 5b86415d682b159d6ea3ca3558f37803e82c5e48 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 9 Feb 2024 22:08:14 -0500 Subject: [PATCH 286/432] Drop IBM Blue Gene/Q and POWER7 architectures --- Makefile.kokkos | 6 ++---- cmake/kokkos_arch.cmake | 2 -- core/src/impl/Kokkos_Core.cpp | 4 ---- .../configuration/test-code/test_config_arch_list.bash | 2 +- generate_makefile.bash | 2 -- gnu_generate_makefile.bash | 2 -- scripts/testing_scripts/generate_makefile.bash | 2 -- 7 files changed, 3 insertions(+), 17 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index e8d4cf752c1..2a3a9e5d73c 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -12,7 +12,7 @@ KOKKOS_DEVICES ?= "Threads" # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 +# IBM: Power8,Power9 # AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC @@ -388,11 +388,9 @@ KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. 
-KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index a7c6357b108..b221407059e 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -60,8 +60,6 @@ DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (A DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(BGQ "IBM Blue Gene Q") -DECLARE_AND_CHECK_HOST_ARCH(POWER7 "IBM POWER7 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index fd49e34ebce..b858003fdba 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -629,8 +629,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "ARMV8_THUNDERX2"); #elif defined(KOKKOS_ARCH_BDW) declare_configuration_metadata("architecture", "CPU architecture", "BDW"); -#elif defined(KOKKOS_ARCH_BGQ) - declare_configuration_metadata("architecture", "CPU architecture", 
"BGQ"); #elif defined(KOKKOS_ARCH_HSW) declare_configuration_metadata("architecture", "CPU architecture", "HSW"); #elif defined(KOKKOS_ARCH_ICL) @@ -643,8 +641,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "KNL"); #elif defined(KOKKOS_ARCH_NATIVE) declare_configuration_metadata("architecture", "CPU architecture", "NATIVE"); -#elif defined(KOKKOS_ARCH_POWER7) - declare_configuration_metadata("architecture", "CPU architecture", "POWER7"); #elif defined(KOKKOS_ARCH_POWER8) declare_configuration_metadata("architecture", "CPU architecture", "POWER8"); #elif defined(KOKKOS_ARCH_POWER9) diff --git a/core/unit_test/configuration/test-code/test_config_arch_list.bash b/core/unit_test/configuration/test-code/test_config_arch_list.bash index 8fe8e2b5ece..743e72ca21b 100755 --- a/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! 
-z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(WSM SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/generate_makefile.bash b/generate_makefile.bash index 47c25a2850b..df9b1b65342 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -170,8 +170,6 @@ display_help_text() { echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" diff --git a/gnu_generate_makefile.bash b/gnu_generate_makefile.bash index 8cf28eef7e5..cea1138dc92 100755 --- a/gnu_generate_makefile.bash +++ b/gnu_generate_makefile.bash @@ -145,8 +145,6 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" diff --git a/scripts/testing_scripts/generate_makefile.bash b/scripts/testing_scripts/generate_makefile.bash index 44541815f26..ef247c5a413 100755 --- a/scripts/testing_scripts/generate_makefile.bash +++ b/scripts/testing_scripts/generate_makefile.bash @@ -133,8 +133,6 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" From 17d07425941c08fe84c0e197d68110620a2c65fa Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 9 Feb 2024 22:09:22 -0500 Subject: [PATCH 287/432] Drop Intel 
Westmere and SSE4.2 extension --- Makefile.kokkos | 18 +----------------- cmake/KokkosCore_config.h.in | 1 - cmake/kokkos_arch.cmake | 13 ------------- core/src/impl/Kokkos_Core.cpp | 10 ++++------ .../test-code/test_config_arch_list.bash | 2 +- generate_makefile.bash | 1 - gnu_generate_makefile.bash | 1 - scripts/testing_scripts/generate_makefile.bash | 1 - 8 files changed, 6 insertions(+), 41 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 2a3a9e5d73c..469178e80d7 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -308,7 +308,6 @@ endif # Intel based. KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -410,13 +409,12 @@ KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_A KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) # Any AVX? -KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? 
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -800,20 +798,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. 
- KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 37f62980c61..33ebfd266b4 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -59,7 +59,6 @@ #cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_ENABLE_ONEDPL -#cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index b221407059e..5ee6f44613e 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -49,7 +49,6 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(WSM "Intel Westmere CPU") DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") @@ -338,18 +337,6 @@ IF (KOKKOS_ARCH_ZEN3) SET(KOKKOS_ARCH_AVX2 ON) ENDIF() -IF (KOKKOS_ARCH_WSM) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSSE4.2 - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=px - DEFAULT -msse4.2 - ) - SET(KOKKOS_ARCH_SSE42 ON) -ENDIF() - IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index b858003fdba..4a696526161 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -653,8 +653,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { 
declare_configuration_metadata("architecture", "CPU architecture", "SNB"); #elif defined(KOKKOS_ARCH_SPR) declare_configuration_metadata("architecture", "CPU architecture", "SPR"); -#elif defined(KOKKOS_ARCH_WSM) - declare_configuration_metadata("architecture", "CPU architecture", "WSM"); #elif defined(KOKKOS_ARCH_AMD_ZEN) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN"); #elif defined(KOKKOS_ARCH_AMD_ZEN2) @@ -735,8 +733,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) - declare_configuration_metadata("architecture", "GPU architecture", - "HOPPER90"); + declare_configuration_metadata("architecture", "GPU architecture", + "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX906"); @@ -744,8 +742,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); #elif defined(KOKKOS_ARCH_AMD_GFX90A) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX90A"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX90A"); #elif defined(KOKKOS_ARCH_AMD_GFX1030) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX1030"); diff --git a/core/unit_test/configuration/test-code/test_config_arch_list.bash b/core/unit_test/configuration/test-code/test_config_arch_list.bash index 743e72ca21b..8bc8ef21cd0 100755 --- a/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! 
-z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/generate_makefile.bash b/generate_makefile.bash index df9b1b65342..25370daa3f2 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -173,7 +173,6 @@ display_help_text() { echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" diff --git a/gnu_generate_makefile.bash b/gnu_generate_makefile.bash index cea1138dc92..7a197bb71d4 100755 --- a/gnu_generate_makefile.bash +++ b/gnu_generate_makefile.bash @@ -148,7 +148,6 @@ do echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" diff --git a/scripts/testing_scripts/generate_makefile.bash b/scripts/testing_scripts/generate_makefile.bash index ef247c5a413..830d7b12d90 100755 --- a/scripts/testing_scripts/generate_makefile.bash +++ b/scripts/testing_scripts/generate_makefile.bash @@ -136,7 +136,6 @@ do echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" From 37962b3d2f9a26ce3fdcb80229ab3a73106f456b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 9 Feb 2024 20:43:26 +0000 Subject: [PATCH 288/432] SYCL: Cleanup device selection --- core/src/SYCL/Kokkos_SYCL.cpp | 33 ++++++++++++--------------------- 1 file changed, 12 
insertions(+), 21 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index a4c844fbed4..9bc1c626c7b 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -142,22 +142,11 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector gpu_devices = Impl::get_sycl_devices(); - - // If the device id is not specified and there are no GPUs, sidestep Kokkos - // device selection and use whatever is available (if no GPU architecture is - // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - Impl::SYCLInternal::m_syclDev = 0; - return; - } -#endif const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); const auto id = ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); - Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); + std::vector sycl_devices = Impl::get_sycl_devices(); + Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); Impl::SYCLInternal::m_syclDev = id; } @@ -251,10 +240,10 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { std::vector get_sycl_devices() { - std::vector gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); #if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ defined(KOKKOS_ARCH_AMD_GPU) + std::vector devices = + sycl::device::get_devices(sycl::info::device_type::gpu); #if defined(KOKKOS_ARCH_INTEL_GPU) sycl::backend backend = sycl::backend::ext_oneapi_level_zero; #elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) @@ -262,13 +251,15 @@ std::vector get_sycl_devices() { #elif defined(KOKKOS_ARCH_AMD_GPU) sycl::backend backend = sycl::backend::ext_oneapi_hip; #endif - gpu_devices.erase(std::remove_if(gpu_devices.begin(), gpu_devices.end(), - 
[backend](const sycl::device& d) { - return d.get_backend() != backend; - }), - gpu_devices.end()); + devices.erase(std::remove_if(devices.begin(), devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + devices.end()); +#else + std::vector devices = sycl::device::get_devices(); #endif - return gpu_devices; + return devices; } int g_sycl_space_factory_initialized = From 3611cfef33ecc29d91b709cd2037becf5cc9da95 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 12 Feb 2024 09:32:25 -0700 Subject: [PATCH 289/432] SYCL: Improve print_configuration (#6795) * SYCL: Improve print_configuration * Account for all device types * Reorder output --- core/src/SYCL/Kokkos_SYCL.cpp | 38 +++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index a4c844fbed4..cd9b00a95dd 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -88,12 +88,13 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } void SYCL::print_configuration(std::ostream& os, bool verbose) const { - os << "Devices:\n"; - os << " KOKKOS_ENABLE_SYCL: yes\n"; - os << "\nRuntime Configuration:\n"; - os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_ENABLE_ONEDPL + os << "macro KOKKOS_ENABLE_ONEDPL : defined\n"; +#else + os << "macro KOKKOS_ENABLE_ONEDPL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; #else @@ -104,15 +105,40 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; #endif - #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif - if (verbose) + int counter = 0; 
+ int active_device = Kokkos::device_id(); + std::cout << "\nAvailable devices: \n"; + std::vector devices = Impl::get_sycl_devices(); + for (const auto& device : devices) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::cpu: device_type = "cpu"; break; + case sycl::info::device_type::gpu: device_type = "gpu"; break; + case sycl::info::device_type::accelerator: + device_type = "accelerator"; + break; + case sycl::info::device_type::custom: device_type = "custom"; break; + case sycl::info::device_type::automatic: device_type = "automatic"; break; + case sycl::info::device_type::host: device_type = "host"; break; + case sycl::info::device_type::all: device_type = "all"; break; + } + os << "[" << device.get_backend() << "]:" << device_type << ':' << counter + << "] " << device.get_info(); + if (counter == active_device) os << " : Selected"; + os << '\n'; + ++counter; + } + + if (verbose) { + os << '\n'; SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); + } } void SYCL::fence(const std::string& name) const { From 4b84ae0e6347521cf505630013151fbd42950579 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Mon, 12 Feb 2024 15:54:44 -0700 Subject: [PATCH 290/432] Add OpenSSF scorecard workflow --- .github/workflows/scorecard.yml | 72 +++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .github/workflows/scorecard.yml diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 00000000000..0c209aa2fc6 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,72 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. 
See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '29 3 * * 1' + push: + branches: [ "master", "develop" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@e38b1902ae4f44df626f11ba0734b14fb91f8f86 # v2.1.2 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). 
Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@17573ee1cc1b9d061760f3a006fc4aac4f944fd5 # v2.2.4 + with: + sarif_file: results.sarif From 54c2336c58a022b8fa98979588dc80674b787d28 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Mon, 12 Feb 2024 16:45:49 -0700 Subject: [PATCH 291/432] Update workflow permissions --- .github/workflows/clang-format-check.yml | 4 ++++ .github/workflows/continuous-integration-workflow-32bit.yml | 2 ++ .github/workflows/continuous-integration-workflow-hpx.yml | 2 ++ .github/workflows/continuous-integration-workflow.yml | 2 ++ .github/workflows/osx.yml | 2 ++ .github/workflows/performance-benchmark.yml | 2 ++ .github/workflows/windows.yml | 3 +-- 7 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 1f557dbfcdf..01b9ec0ba0c 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,5 +1,9 @@ name: clang-format check + on: [push, pull_request] + +permissions: read-all + jobs: formatting-check: runs-on: ubuntu-latest diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 68fbdbe8a47..c58e1407dc9 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -9,6 +9,8 @@ on: - '**/*.md' types: [ opened, reopened, synchronize ] +permissions: read-all + concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: 
${{github.event_name == 'pull_request'}} diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 187c4e61ec3..06500a6bb6b 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: hpx: name: hpx diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 645cdce83cf..a1d5122eaf2 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: CI: continue-on-error: true diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 85b079e56c8..93524549e3e 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: osxci: name: osx-ci diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 59eed4f6096..b46b2cb26b3 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -8,6 +8,8 @@ on: - '**/*.md' types: [ opened, reopened, synchronize ] +permissions: read-all + jobs: CI: continue-on-error: true diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e66c6cb99df..cff3e05ad7b 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ 
-8,8 +8,7 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} -permissions: - contents: read +permissions: read-all jobs: windows-cuda: From 513d8db0593dfebcdbc06bc8a388ebb840fe61e1 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Tue, 13 Feb 2024 18:09:29 +0100 Subject: [PATCH 292/432] fix constness for views --- algorithms/src/std_algorithms/Kokkos_Copy.hpp | 6 +++--- .../src/std_algorithms/Kokkos_CopyBackward.hpp | 6 +++--- algorithms/src/std_algorithms/Kokkos_CopyIf.hpp | 8 +++++--- algorithms/src/std_algorithms/Kokkos_CopyN.hpp | 6 +++--- algorithms/src/std_algorithms/Kokkos_Equal.hpp | 12 ++++++------ .../Kokkos_LexicographicalCompare.hpp | 15 +++++++++------ algorithms/src/std_algorithms/Kokkos_Move.hpp | 6 +++--- .../src/std_algorithms/Kokkos_MoveBackward.hpp | 6 +++--- .../src/std_algorithms/Kokkos_ReverseCopy.hpp | 6 +++--- .../src/std_algorithms/Kokkos_SwapRanges.hpp | 6 +++--- .../src/std_algorithms/Kokkos_Transform.hpp | 13 +++++++------ 11 files changed, 48 insertions(+), 42 deletions(-) diff --git a/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/algorithms/src/std_algorithms/Kokkos_Copy.hpp index b7ce1ba5edb..c5406c72b0d 100644 --- a/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index 8f9e0f19b80..82071a9362e 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index ba18bc76b93..599fde5737a 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -54,7 +54,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const ExecutionSpace& ex, const 
::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -69,7 +70,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -96,7 +98,7 @@ template & source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index 43c91204837..637d8d4cbc5 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -51,7 +51,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, Size count, - ::Kokkos::View& dest) { + 
const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/algorithms/src/std_algorithms/Kokkos_Equal.hpp index a72a49cc22b..593c42f87e1 100644 --- a/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -80,7 +80,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -96,7 +96,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -111,7 +111,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -128,7 +128,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -227,7 +227,7 @@ template & view1, - ::Kokkos::View& view2) { + const 
::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -243,7 +243,7 @@ template & view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 4b5c69df451..e13479c370b 100644 --- a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -54,7 +54,7 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -71,7 +71,7 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -112,7 +112,8 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -129,7 +130,8 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -161,7 +163,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -187,7 +189,8 @@ template & view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/algorithms/src/std_algorithms/Kokkos_Move.hpp b/algorithms/src/std_algorithms/Kokkos_Move.hpp index f04ea12ba88..ac308ea1845 100644 --- a/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 375474ca57f..2789ab21796 100644 
--- a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -41,7 +41,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 37336c983ab..66f39c4eaa6 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 39f33b64879..d66763d304c 100644 --- a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -40,7 +40,7 @@ template , int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template , int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 838c9169e25..84cbed524d3 100644 --- a/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -58,7 +58,7 @@ template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -73,7 +73,7 @@ template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -119,7 +119,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -137,7 +137,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -174,7 +174,8 @@ template & source, - ::Kokkos::View& dest, UnaryOperation unary_op) { + const ::Kokkos::View& dest, + UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); From c95f9542f8d117a3462f60c1200ed8de75a930e9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 13 Feb 2024 12:57:18 -0500 Subject: [PATCH 293/432] Fix fence in Kokkos::sort when using std::sort --- algorithms/src/sorting/Kokkos_SortPublicAPI.hpp | 6 ++++-- 1 file changed, 
4 insertions(+), 2 deletions(-) diff --git a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index a763c41e580..308e9e3a008 100644 --- a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -29,7 +29,7 @@ namespace Kokkos { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view) { // constraints using ViewType = Kokkos::View; @@ -52,6 +52,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort without comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last); @@ -82,7 +83,7 @@ void sort(const Kokkos::View& view) { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view, const ComparatorType& comparator) { // constraints @@ -105,6 +106,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort with comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last, comparator); From 2a8ac6f48a467e91b0abce5edf22b34fbfe50e4f Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Tue, 13 Feb 2024 11:50:54 -0700 Subject: [PATCH 294/432] Adding SECURITY.md file This is based on the one from OSSF Scorecard --- SECURITY.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000000..93cf6e3663e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security 
Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90 day disclosure timeline. From bd9db15621326f8635aca1a227dfb7acc1023efd Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 13 Feb 2024 14:12:55 -0500 Subject: [PATCH 295/432] [ci skip] Update license badge and links in the README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a5b9811cec9..19793bb82d9 100644 --- a/README.md +++ b/README.md @@ -48,10 +48,10 @@ Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citati # License -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). 
From a1199b3df109f34021ca26bfda78e2b0cda4aa4f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 13 Feb 2024 20:23:10 -0700 Subject: [PATCH 296/432] Explicitly pass template params to ZeroMemset for intel icpc compilers (#6807) * Explicitly pass template params to ZeroMemset for intel icpc compilers Avoid apparent CTAD issue with older icpc intel compilers Resolves issue #6775 * Apply clang-format * Fix typo * Apply ZeroMemset changes consistently to other header * Apply clang-format * Address PR review comments * Make compiler happy * Apply clang-format --- core/src/Kokkos_CopyViews.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 9b379a092bd..08f6ba8d696 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -1359,7 +1359,11 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - ZeroMemset(exec_space, dst); + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset>(exec_space, dst); else contiguous_fill(exec_space, dst, value); } @@ -1391,7 +1395,11 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset(exec, dst); + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset(exec, dst); else #endif contiguous_fill(exec, dst, value); From 48588d08b3f69d4aad63e1bc8ec6b334d0faf059 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 14 Feb 2024 15:41:12 -0500 Subject: [PATCH 297/432] Add CodeQL GitHub Action (#6818) * Add CodeQL GitHub 
Action * Only run on push * Also build examples and benchmarks * Also run on this pull request * User permissions: read-all * Drop running tests and compiler warnings --- .github/workflows/codeql.yml | 51 ++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..2ed86a14751 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,51 @@ +name: "CodeQL" + +on: + push: + branches: [ "master", "develop", "release-*" ] + pull_request: + branches: [ "develop" ] + +permissions: read-all + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + timeout-minutes: 360 + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: c-cpp + + - name: configure + run: + cmake -B build . 
+ -DKokkos_ENABLE_OPENMP=ON + -DCMAKE_CXX_STANDARD=17 + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF + -DKokkos_ENABLE_TESTS=ON + -DKokkos_ENABLE_EXAMPLES=ON + -DKokkos_ENABLE_BENCHMARKS=ON + -DCMAKE_BUILD_TYPE=Debug + - name: build + run: + cmake --build build --parallel 2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:c-cpp" From df68761f97725144a6ad612c8b44186685e55cf2 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 16 Feb 2024 15:21:38 +0000 Subject: [PATCH 298/432] SYCL CI: Avoid setvars.sh --- .jenkins | 5 ++--- scripts/docker/Dockerfile.sycl | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.jenkins b/.jenkins index 87053cbe833..5a859420fd3 100644 --- a/.jenkins +++ b/.jenkins @@ -107,12 +107,11 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''. /opt/intel/oneapi/setvars.sh --include-intel-llvm && \ - rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 714461bfe6a..87864da1bf7 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -55,3 +55,12 @@ RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_one chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \ ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ rm l_oneDPL_p_2022.0.0.25335.sh + +# clang++ +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin-llvm/:$PATH +# sycl-ls, icpx +ENV 
PATH=/opt/intel/oneapi/compiler/latest/linux/bin/:$PATH +# libsycl +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/lib:$LD_LIBRARY_PATH +# libsvml +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH From c3f0a2698327fbbc25a1f26674b8f60ae9eb701b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 16 Feb 2024 14:10:53 -0500 Subject: [PATCH 299/432] Cleanup KOKKOS_CONFIGURE_CORE --- Makefile.kokkos | 3 +-- cmake/kokkos_tribits.cmake | 15 +++------------ core/src/Kokkos_Core.hpp | 3 --- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index e8d4cf752c1..705333e426f 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1224,7 +1224,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1465,7 +1464,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp 
KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/cmake/kokkos_tribits.cmake b/cmake/kokkos_tribits.cmake index b30ca70ab95..060a7a8472c 100644 --- a/cmake/kokkos_tribits.cmake +++ b/cmake/kokkos_tribits.cmake @@ -237,18 +237,10 @@ ENDMACRO() ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp MACRO(KOKKOS_CONFIGURE_CORE) - SET(FWD_BACKEND_LIST) - FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) - LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) - ENDFOREACH() - FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) - LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) - ENDFOREACH() - MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") 
KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") @@ -309,7 +301,6 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index f75f9069c31..1f146563be2 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -302,9 +302,6 @@ std::vector partition_space(ExecSpace const& space, // implementation of the RAII wrapper is using Kokkos::single. #include -// Specializations required after core definitions -#include - //---------------------------------------------------------------------------- // Redefinition of the macros min and max if we pushed them at entry of // Kokkos_Core.hpp From 1112e07ebc3194355a7ae9766a91f54d2759f059 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 16 Feb 2024 15:09:24 -0500 Subject: [PATCH 300/432] Update GitHub actions to use Node 20 --- .github/workflows/clang-format-check.yml | 2 +- .github/workflows/continuous-integration-workflow-32bit.yml | 2 +- .github/workflows/continuous-integration-workflow-hpx.yml | 6 +++--- .github/workflows/continuous-integration-workflow.yml | 6 +++--- .github/workflows/osx.yml | 2 +- .github/workflows/performance-benchmark.yml | 4 ++-- .github/workflows/windows.yml | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 1f557dbfcdf..cc25981f9c7 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -4,7 +4,7 @@ jobs: formatting-check: runs-on: ubuntu-latest 
steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Run clang-format style check. uses: DoozyX/clang-format-lint-action@v0.16.2 with: diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 68fbdbe8a47..26bc9e02d27 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -21,7 +21,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/ubuntu:latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install_multilib run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib - name: Configure Kokkos diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 187c4e61ec3..bf2cfdfef9a 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -20,7 +20,7 @@ jobs: steps: - name: checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: kokkos - name: setup hpx dependencies @@ -33,12 +33,12 @@ jobs: libboost-all-dev \ ninja-build - name: checkout hpx - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: STELLAR-GROUP/hpx ref: v1.9.0 path: hpx - - uses: actions/cache@v3 + - uses: actions/cache@v4 id: cache-hpx with: path: ./hpx/install diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 645cdce83cf..636e2ef45b2 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -67,7 +67,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} steps: - name: Checkout desul - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: desul/desul ref: 
477da9c8f40f8db369c28dd3f93a67e376d8511b @@ -82,8 +82,8 @@ jobs: cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install .. sudo cmake --build . --target install --parallel 2 - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 85b079e56c8..36d673f383e 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -31,7 +31,7 @@ jobs: cmake_build_type: "Release" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: configure run: cmake -B build . diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 59eed4f6096..f8fcd59ee7c 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -23,8 +23,8 @@ jobs: BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} steps: - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e66c6cb99df..424f99547d3 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -18,7 +18,7 @@ jobs: runs-on: windows-2022 steps: - - uses: Jimver/cuda-toolkit@v0.2.11 + - uses: Jimver/cuda-toolkit@v0.2.14 id: cuda-toolkit with: cuda: '12.1.0' From 361bdbf49e0093945950e1450c296b7906d76c0b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 30 Jan 2024 16:11:58 -0700 Subject: [PATCH 301/432] [4.2.01]: changelog update (#6656) * [ci skip] [4.2.01]: 
changelog update * [ci skip] [4.2.01]: changelog update * Update CHANGELOG.md Co-authored-by: Damien L-G * Update CHANGELOG.md Co-authored-by: Damien L-G * Update CHANGELOG.md Co-authored-by: Damien L-G * Update CHANGELOG.md Address review comments - Add missing item for MSVC CUDA build fixes - Drop item unnecessary for changelog * Update changelog * Update changelog * [ci skip] Update patch version for 4.2.01 * Apply suggestions from code review --------- Co-authored-by: Nathan Ellingwood Co-authored-by: Damien L-G --- CHANGELOG.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92bb6fdbe5c..e1d06958295 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ # CHANGELOG +## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746) + +#### HIP: +- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624) +- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671) + +### Build System +- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742) + +### Bug Fixes +- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653) +- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635) +- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659) +- Fix "extra `;`" warning with `-pedantic` flag in `` [\#6510](https://github.com/kokkos/kokkos/pull/6510) + ## 
[4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00) From 9d33cb7725943e048e150f7c9c58473ba98622e5 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 20 Feb 2024 23:16:21 -0500 Subject: [PATCH 302/432] Clean up shift_{right, left}_team_impl (#6821) * Clean up shift_right_team_impl * Update shift_left_team_impl for consistency --- .../std_algorithms/impl/Kokkos_ShiftLeft.hpp | 5 ++-- .../std_algorithms/impl/Kokkos_ShiftRight.hpp | 25 +++---------------- 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp index 50bc7c8d610..94147485071 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp @@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first + n, last); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { first[i] = std::move(first[i + n]); } }); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp index cac20bfbba6..0414e6f1c25 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp @@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl( return first + n; } -template -struct StdShiftRightTeamSingleFunctor { - Iterator m_first; - Iterator m_last; - std::size_t m_shift; - - KOKKOS_FUNCTION - 
void operator()() const { - // the impl function calling this functor guarantees that - // - m_shift is non-negative - // - m_first, m_last identify a valid range with m_last > m_first - // - m_shift is less than m_last - m_first - // so I can safely use std::size_t here - } - - KOKKOS_FUNCTION - StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n) - : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {} -}; - template KOKKOS_FUNCTION IteratorType shift_right_team_impl( const TeamHandleType& teamHandle, IteratorType first, IteratorType last, @@ -145,10 +125,11 @@ KOKKOS_FUNCTION IteratorType shift_right_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first, last - n); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { last[-i - 1] = std::move(last[-n - i - 1]); } }); From e2c810e1f213a09bce152eba623b7f275c035ec8 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 22 Feb 2024 08:26:53 -0500 Subject: [PATCH 303/432] Avoid detecting unwanted fences in the parallel_scan_no_fence test --- core/unit_test/tools/TestEventCorrectness.hpp | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/core/unit_test/tools/TestEventCorrectness.hpp b/core/unit_test/tools/TestEventCorrectness.hpp index 3c85f661aae..946169a786d 100644 --- a/core/unit_test/tools/TestEventCorrectness.hpp +++ b/core/unit_test/tools/TestEventCorrectness.hpp @@ -409,14 +409,19 @@ TEST(kokkosp, parallel_scan_no_fence) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. 
+ // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); auto success = validate_absence( - [=]() { - TestScanFunctor tf; - Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); - }, + [=]() { Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); }, [=](BeginFenceEvent begin_event) { if (begin_event.name.find("Debug Only Check for Execution Error") != std::string::npos || @@ -450,13 +455,20 @@ TEST(kokkosp, parallel_scan_no_fence_view) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. 
+ TestScanFunctor tf; + Kokkos::View v("scan_result"); + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); - Kokkos::View v("scan_result"); auto success = validate_absence( [=]() { - TestScanFunctor tf; Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); }, [=](BeginFenceEvent begin_event) { From 24f251a854b36c13804e1ac850e8ae2a9e2bcdac Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 27 Feb 2024 11:29:44 -0500 Subject: [PATCH 304/432] Add test for current CTAD support with RangePolicy (#6803) * Add test for current CTAD support with RangePolicy Co-authored-by: Nevin Liber * Rework CTAD test to avoid "member was declared but never referenced" warnings with icpc 19 * Attempt to fix CI * Attempt to eliminate maybe unused warning in icpc * Disable CTAD tests for nvcc < 11.2 as compiler bugs prevent CTAD expressions inside decltype --------- Co-authored-by: Nevin Liber Co-authored-by: Nevin ":-)" Liber --- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestRangePolicyCTAD.cpp | 87 ++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 core/unit_test/TestRangePolicyCTAD.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 28514d2d7c1..4fc7e841fc7 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -86,6 +86,7 @@ SET(COMPILE_ONLY_SOURCES TestDetectionIdiom.cpp TestBitManipulation.cpp TestInterOp.cpp + TestRangePolicyCTAD.cpp TestStringManipulation.cpp TestVersionMacros.cpp TestViewRank.cpp diff --git a/core/unit_test/TestRangePolicyCTAD.cpp b/core/unit_test/TestRangePolicyCTAD.cpp new file mode 100644 index 00000000000..d554c33bd88 --- /dev/null +++ b/core/unit_test/TestRangePolicyCTAD.cpp @@ -0,0 +1,87 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "Kokkos_Core_fwd.hpp" + +namespace { + +template +using PolicyMaker = decltype(::Kokkos::RangePolicy(std::declval()...)); + +template +inline constexpr bool IsSamePolicy = + std::is_same_v>; + +#define KOKKOS_TEST_RANGE_POLICY(...) static_assert(IsSamePolicy<__VA_ARGS__>) + +struct TestRangePolicyCTAD { + struct ImplicitlyConvertibleToDefaultExecutionSpace { + operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + using des = Kokkos::DefaultExecutionSpace; + using nes = ImplicitlyConvertibleToDefaultExecutionSpace; + using i64 = int64_t; + using i32 = int32_t; + using cs = Kokkos::ChunkSize; + + // RangePolicy() + + // Guard against GCC 8.4 bug + // error: cannot deduce template arguments for ‘RangePolicy’ from () + // error: template argument 2 is invalid +#if !defined(KOKKOS_COMPILER_GNU) || (KOKKOS_COMPILER_GNU > 900) + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<> /*, no argument */); +#endif + + // RangePolicy(index_type, index_type) + +#if !defined(KOKKOS_COMPILER_NVCC) || KOKKOS_COMPILER_NVCC >= 1120 + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i64); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i32); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i64); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i32); + + // RangePolicy(index_type, index_type, Args...) 
+ + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i64, cs); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i32, cs); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i64, cs); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i32, cs); + + // RangePolicy(execution_space, index_type, index_type) + + // none (ambiguous deduction for template arguments) + + // RangePolicy(execution_space, index_type, index_type, Args...) + + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i64, i64, cs); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i32, i32, cs); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i64, i64, cs); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i32, i32, cs); +#endif +}; // TestRangePolicyCTAD struct + +// To eliminate maybe_unused warning on some compilers +const Kokkos::DefaultExecutionSpace des = + TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); + +} // namespace From 16a5ebe95e88bf103ff7459ca1b1ce9983426eed Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 28 Feb 2024 09:42:23 -0500 Subject: [PATCH 305/432] multi-GPU support: Add test for all policies (#6782) * Cuda multi-GPU support: Test with managed and unmanaged Views * Move check for cuda_device * Also test copying between devices * Refactor using StreamsAndDevices * Don't use shared_ptr --- core/unit_test/CMakeLists.txt | 2 +- .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 160 +++++++++++++++--- 2 files changed, 142 insertions(+), 20 deletions(-) diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 4fc7e841fc7..5e56c45b8ff 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -789,7 +789,7 @@ if(Kokkos_ENABLE_CUDA) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaInterOpStreamsMultiGPU SOURCES - UnitTestMain.cpp + UnitTestMainInit.cpp cuda/TestCuda_InterOp_StreamsMultiGPU.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( diff --git 
a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp index 8a8270c7f93..5fec4020921 100644 --- a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -18,32 +18,154 @@ #include namespace { -TEST(cuda, multi_gpu) { - Kokkos::initialize(); - int n_devices; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); +struct StreamsAndDevices { + std::array streams; + std::array devices; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(0)); - cudaStream_t stream0; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream0)); + StreamsAndDevices() { + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(n_devices - 1)); - cudaStream_t stream; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); + devices = {0, n_devices - 1}; + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i])); + } + } + StreamsAndDevices(const StreamsAndDevices &) = delete; + StreamsAndDevices &operator=(const StreamsAndDevices &) = delete; + ~StreamsAndDevices() { + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); + } + } +}; + +std::array get_execution_spaces( + const StreamsAndDevices &streams_and_devices) { + TEST_EXECSPACE exec0(streams_and_devices.streams[0]); + TEST_EXECSPACE exec1(streams_and_devices.streams[1]); + + // Must return void to use ASSERT_EQ + [&]() { + ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]); + ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]); + }(); + + return {exec0, exec1}; +} + +// Test Interoperability with Cuda Streams +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename 
TEST_EXECSPACE::memory_space; + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy>(exec, 10, + 10), + 
Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +TEST(cuda_multi_gpu, managed_views) { + StreamsAndDevices streams_and_devices; { - TEST_EXECSPACE space0(stream0); - ASSERT_EQ(space0.cuda_device(), 0); - TEST_EXECSPACE space(stream); - ASSERT_EQ(space.cuda_device(), n_devices - 1); + std::array execs = + get_execution_spaces(streams_and_devices); + + Kokkos::View view0( + Kokkos::view_alloc("v0", execs[0]), 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); } - Kokkos::finalize(); +} + +TEST(cuda_multi_gpu, unmanaged_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(0)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device())); + int *p0; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p0), sizeof(int) * 100)); + Kokkos::View view0(p0, 100); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(n_devices - 1)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device())); + int *p; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p), sizeof(int) * 100)); + Kokkos::View view(p, 100); + + test_policies(execs[0], view0, execs[1], view); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); + } } } // namespace From c90a9c6f7bf439b4e224a0eaabcc2af815980dc5 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Wed, 28 Feb 2024 15:45:34 -0500 Subject: [PATCH 306/432] Implement sort_by_key (#6801) * Implement sort_by_key * Address review comments * Make passed in view const ref * Fix _via_sort for OpenMPTarget Co-authored-by: Daniel Arndt * Rip out ROCThrust for now * Few changes to address the comments * Fix SYCL * Fix SYCL implementation --------- Co-authored-by: Daniel Arndt 
--- algorithms/src/Kokkos_Sort.hpp | 1 + .../src/sorting/Kokkos_SortByKeyPublicAPI.hpp | 117 +++++++ .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 312 ++++++++++++++++++ algorithms/unit_tests/CMakeLists.txt | 1 + algorithms/unit_tests/TestSortByKey.hpp | 241 ++++++++++++++ 5 files changed, 672 insertions(+) create mode 100644 algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp create mode 100644 algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp create mode 100644 algorithms/unit_tests/TestSortByKey.hpp diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index f77484cc555..136b4ec82dc 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -23,6 +23,7 @@ #include "sorting/Kokkos_BinSortPublicAPI.hpp" #include "sorting/Kokkos_SortPublicAPI.hpp" +#include "sorting/Kokkos_SortByKeyPublicAPI.hpp" #include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT diff --git a/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 00000000000..fc73eccad68 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include +#include + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 00000000000..698d059f939 --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,312 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. 
+#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include +#include +#endif + +namespace Kokkos::Impl { + +template +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view::value&& T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value); + +template +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template +void sort_by_key_onedpl( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward(maybeComparator)...); +} +#endif + +template +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral::value); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +template +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + auto const n = keys.size(); + + Kokkos::View permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + KOKKOS_LAMBDA(int i) { permute(i) = i; }); + +// FIXME OPENMPTARGET The sort happens on the host so we have to copy keys there +#ifdef KOKKOS_ENABLE_OPENMPTARGET + auto keys_in_comparator = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + Kokkos::deep_copy(exec, keys_in_comparator, keys); +#else + auto keys_in_comparator = keys; +#endif + + static_assert(sizeof...(MaybeComparator) <= 1); + if constexpr (sizeof...(MaybeComparator) == 0) { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys_in_comparator.data(); + auto stride = keys_in_comparator.stride(0); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); +#else + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_in_comparator(i) < keys_in_comparator(j); + }); +#endif + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys_in_comparator.data(); + auto stride = keys_in_comparator.stride(0); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); +#else + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(keys_in_comparator(i), keys_in_comparator(j)); + }); +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// 
------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, values); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const 
Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index 7d5d0c67652..db184bc8a99 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -25,6 +25,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) set(ALGO_SORT_SOURCES) foreach(SOURCE_Input TestSort + TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB diff --git a/algorithms/unit_tests/TestSortByKey.hpp b/algorithms/unit_tests/TestSortByKey.hpp new file mode 100644 index 00000000000..16f68eaaf26 --- /dev/null +++ b/algorithms/unit_tests/TestSortByKey.hpp @@ -0,0 +1,241 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP + +#include +#include +#include +#include + +#include // pair + +namespace Test { +namespace SortImpl { + +struct Less { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs < rhs; + } +}; + +struct Greater { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs > rhs; + } +}; + +template +struct is_sorted_by_key_struct { + Keys keys; + Keys keys_orig; + Permute permute; + Comparator comparator; + + is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_, + Comparator comparator_ = Comparator{}) + : keys(keys_), + keys_orig(keys_orig_), + permute(permute_), + comparator(comparator_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int &count) const { + if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count; + if (keys(i) != keys_orig(permute(i))) ++count; + } +}; + +template +void iota(ExecutionSpace const &space, ViewType const &v, + typename ViewType::value_type value = 0) { + using ValueType = typename ViewType::value_type; + Kokkos::parallel_for( + "ArborX::Algorithms::iota", + Kokkos::RangePolicy(space, 0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; }); +} + +} // namespace SortImpl + +TEST(TEST_CATEGORY, SortByKeyEmptyView) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 0); + Kokkos::View values("values", 0); + + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); +} + +TEST(TEST_CATEGORY, SortByKey) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + for (auto keys_vector : 
{std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct(keys, keys_orig, + permute), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyWithComparator) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + SortImpl::Greater comparator; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute, comparator); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys, keys_orig, permute, comparator), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyStaticExtents) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + Kokkos::View keys("keys"); + + Kokkos::View 
values_static("values_static"); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_static)); + + Kokkos::View values_dynamic("values_dynamic", 10); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); +} + +template +void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys, + Values &values) { + Kokkos::parallel_for( + "create_data", + Kokkos::MDRangePolicy, ExecutionSpace>(space, {0, 0, 0}, + {n, n, n}), + KOKKOS_LAMBDA(int i, int j, int k) { + keys(i, j, k) = n - i; + values(i, j, k) = j; + }); +} + +TEST(TEST_CATEGORY, SortByKeyWithStrides) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + auto const n = 10; + + Kokkos::View keys("keys", n, n, n); + Kokkos::View values("values", n, n, n); + buildViewsForStrided(space, n, keys, values); + + auto keys_sub = Kokkos::subview(keys, Kokkos::ALL(), 1, 2); + auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6); + + auto keys_orig = Kokkos::create_mirror(space, keys_sub); + Kokkos::deep_copy(space, keys_orig, keys_sub); + + Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys_sub, keys_orig, values_sub), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); +} + +TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 3); + Kokkos::View values("values", 1); + + ASSERT_DEATH( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values), + "values and keys extents must be the same"); + ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values, + SortImpl::Greater{}), + "values and keys extents must be the same"); +} + +} // namespace Test +#endif From 058c3a08e6b871f2ddfe3774f390ee473525df5e Mon Sep 17 00:00:00 2001 From: Daniel 
Arndt Date: Wed, 28 Feb 2024 17:52:40 -0500 Subject: [PATCH 307/432] Fix scorecard workflow (#6831) * Add Scorecrad * another one * Don't run on pull requests * Add comments back in --- .github/workflows/scorecard.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 0c209aa2fc6..3d7ede20773 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -10,9 +10,10 @@ on: # To guarantee Maintained check is occasionally updated. See # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained schedule: - - cron: '29 3 * * 1' + # Weekly on Saturdays. + - cron: '30 1 * * 6' push: - branches: [ "master", "develop" ] + branches: [ master, develop ] # Declare default permissions as read only. permissions: read-all @@ -32,12 +33,12 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2 with: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@e38b1902ae4f44df626f11ba0734b14fb91f8f86 # v2.1.2 + uses: ossf/scorecard-action@80e868c13c90f172d68d1f4501dee99e2479f7af # v2.1.3 with: results_file: results.sarif results_format: sarif @@ -59,14 +60,14 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 with: name: SARIF file path: results.sarif retention-days: 5 # Upload the results to GitHub's code scanning dashboard. 
- - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@17573ee1cc1b9d061760f3a006fc4aac4f944fd5 # v2.2.4 + - name: "Upload SARIF results to code scanning" + uses: github/codeql-action/upload-sarif@83f0fe6c4988d98a455712a27f0255212bba9bd4 # v2.3.6 with: sarif_file: results.sarif From 04a5334c699cb9b87293d27bc73090b3b7c13019 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 28 Feb 2024 18:15:58 -0500 Subject: [PATCH 308/432] Remove redundant RangePolicy constructor (#6841) * Remove redundant RangePolicy constructor * Disable test case for Cuda <11.2 completely --- core/src/Kokkos_ExecPolicy.hpp | 18 ------------------ core/unit_test/TestRangePolicyCTAD.cpp | 11 ++++++++--- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index 343af5bd690..feab22a93e7 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -114,24 +114,6 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity(0), m_granularity_mask(0) {} - /** \brief Total range */ - template && - std::is_convertible_v), - bool> = false> - inline RangePolicy(const typename traits::execution_space& work_space, - const IndexType1 work_begin, const IndexType2 work_end) - : m_space(work_space), - m_begin(work_begin), - m_end(work_end), - m_granularity(0), - m_granularity_mask(0) { - check_conversion_safety(work_begin); - check_conversion_safety(work_end); - check_bounds_validity(); - set_auto_chunk_size(); - } - /** \brief Total range */ template && diff --git a/core/unit_test/TestRangePolicyCTAD.cpp b/core/unit_test/TestRangePolicyCTAD.cpp index d554c33bd88..588d960d680 100644 --- a/core/unit_test/TestRangePolicyCTAD.cpp +++ b/core/unit_test/TestRangePolicyCTAD.cpp @@ -17,6 +17,8 @@ #include #include "Kokkos_Core_fwd.hpp" +#if !defined(KOKKOS_COMPILER_NVCC) || KOKKOS_COMPILER_NVCC >= 1120 + namespace { template @@ -54,7 +56,6 @@ struct TestRangePolicyCTAD { // 
RangePolicy(index_type, index_type) -#if !defined(KOKKOS_COMPILER_NVCC) || KOKKOS_COMPILER_NVCC >= 1120 KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i64); KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i32); KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i64); @@ -69,7 +70,10 @@ struct TestRangePolicyCTAD { // RangePolicy(execution_space, index_type, index_type) - // none (ambiguous deduction for template arguments) + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i64, i64); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i32, i32); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i64, i64); + KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i32, i32); // RangePolicy(execution_space, index_type, index_type, Args...) @@ -77,7 +81,6 @@ struct TestRangePolicyCTAD { KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i32, i32, cs); KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i64, i64, cs); KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i32, i32, cs); -#endif }; // TestRangePolicyCTAD struct // To eliminate maybe_unused warning on some compilers @@ -85,3 +88,5 @@ const Kokkos::DefaultExecutionSpace des = TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); } // namespace + +#endif From 0cdc9eb768582bec4af8f0e44057e235ca8277f0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 29 Feb 2024 17:55:55 -0500 Subject: [PATCH 309/432] Bump Google Benchmark version v1.{6.2 -> 7.1} in CMake FetchContent --- core/perf_test/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 7f3916da312..e0dba03e1ec 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -50,8 +50,8 @@ ELSE() FetchContent_Declare( googlebenchmark DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b + URL 
https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz + URL_HASH MD5=0459a6c530df9851bee6504c3e37c2e7 ) FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) From 8b8de2cf47cd85005b6adafb8fe5c99b57c32c23 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Fri, 1 Mar 2024 14:43:56 -0600 Subject: [PATCH 310/432] Remove variadic range policy constructor (#6845) * Removed variadic ctor from RangePolicy, as well as extra set(...) methods. * In RangePolicy: Deprecated set(ChunkSize) In the ctor that takes a ChunkSize, set the chunk size via set_chunk_size instead of via member initializers (to be more consistent with other code) * Removed superfluous "inline" from two of the RangePolicy constructors --- core/src/Kokkos_ExecPolicy.hpp | 47 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index feab22a93e7..d0051560398 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -123,13 +123,12 @@ class RangePolicy : public Impl::PolicyTraits { : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} /** \brief Total range */ - template && std::is_convertible_v), bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const IndexType1 work_begin, const IndexType2 work_end, - Args... 
args) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), m_begin(work_begin), m_end(work_end), @@ -139,7 +138,24 @@ class RangePolicy : public Impl::PolicyTraits { check_conversion_safety(work_end); check_bounds_validity(); set_auto_chunk_size(); - set(args...); + } + + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const typename traits::execution_space& work_space, + const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) + : m_space(work_space), + m_begin(work_begin), + m_end(work_end), + m_granularity(0), + m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); + set_chunk_size(chunk_size.value); } /** \brief Total range */ @@ -147,28 +163,19 @@ class RangePolicy : public Impl::PolicyTraits { std::enable_if_t<(std::is_convertible_v && std::is_convertible_v), bool> = false> - inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, - Args... args) + RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) : RangePolicy(typename traits::execution_space(), work_begin, work_end, - args...) {} - - private: - inline void set() {} + chunk_size) {} public: - template - inline void set(Args...) { - static_assert( - 0 == sizeof...(Args), - "Kokkos::RangePolicy: unhandled constructor arguments encountered."); - } - - template - inline void set(const ChunkSize& chunksize, Args... 
args) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") + inline void set(ChunkSize chunksize) { m_granularity = chunksize.value; m_granularity_mask = m_granularity - 1; - set(args...); } +#endif public: /** \brief return chunk_size */ From dc524910d397feb7050b39f34d38dfd7d4f82b65 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 1 Mar 2024 21:05:29 +0000 Subject: [PATCH 311/432] Avoid unused variable warning in TestRangePolicyCTAD.cpp --- core/unit_test/TestRangePolicyCTAD.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestRangePolicyCTAD.cpp b/core/unit_test/TestRangePolicyCTAD.cpp index 588d960d680..6a2c0f507be 100644 --- a/core/unit_test/TestRangePolicyCTAD.cpp +++ b/core/unit_test/TestRangePolicyCTAD.cpp @@ -84,7 +84,7 @@ struct TestRangePolicyCTAD { }; // TestRangePolicyCTAD struct // To eliminate maybe_unused warning on some compilers -const Kokkos::DefaultExecutionSpace des = +[[maybe_unused]] const Kokkos::DefaultExecutionSpace des = TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); } // namespace From 277339090e02e6555cf4f8508bd4b1f78bb6d2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 22 Feb 2024 18:02:43 +0100 Subject: [PATCH 312/432] bytes_and_flops with CMake Fix some warnings as well --- benchmarks/CMakeLists.txt | 1 + benchmarks/bytes_and_flops/CMakeLists.txt | 4 ++++ benchmarks/bytes_and_flops/bench.hpp | 12 ++++++------ benchmarks/bytes_and_flops/bench_double.cpp | 2 +- benchmarks/bytes_and_flops/bench_float.cpp | 2 +- benchmarks/bytes_and_flops/bench_int32_t.cpp | 2 +- benchmarks/bytes_and_flops/bench_int64_t.cpp | 2 +- benchmarks/bytes_and_flops/bench_stride.hpp | 16 ++++++++-------- .../bytes_and_flops/bench_unroll_stride.hpp | 16 ++++++++-------- benchmarks/bytes_and_flops/main.cpp | 2 +- 10 files changed, 32 insertions(+), 27 deletions(-) create mode 100644 
benchmarks/bytes_and_flops/CMakeLists.txt diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index bf946714d21..773544b6f32 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,3 +1,4 @@ +KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) diff --git a/benchmarks/bytes_and_flops/CMakeLists.txt b/benchmarks/bytes_and_flops/CMakeLists.txt new file mode 100644 index 00000000000..0ce44a6f1a8 --- /dev/null +++ b/benchmarks/bytes_and_flops/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + bytes_and_flops + SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp +) diff --git a/benchmarks/bytes_and_flops/bench.hpp b/benchmarks/bytes_and_flops/bench.hpp index 2589fd7309b..88830af624b 100644 --- a/benchmarks/bytes_and_flops/bench.hpp +++ b/benchmarks/bytes_and_flops/bench.hpp @@ -37,22 +37,22 @@ struct RunStride { }; #define STRIDE 1 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 2 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 4 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 8 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 16 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 32 -#include +#include "bench_stride.hpp" #undef STRIDE template diff --git a/benchmarks/bytes_and_flops/bench_double.cpp b/benchmarks/bytes_and_flops/bench_double.cpp index f955c996660..2fda1ae3d42 100644 --- a/benchmarks/bytes_and_flops/bench_double.cpp +++ b/benchmarks/bytes_and_flops/bench_double.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/benchmarks/bytes_and_flops/bench_float.cpp b/benchmarks/bytes_and_flops/bench_float.cpp index 137ff67d404..3210116a9ee 
100644 --- a/benchmarks/bytes_and_flops/bench_float.cpp +++ b/benchmarks/bytes_and_flops/bench_float.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/benchmarks/bytes_and_flops/bench_int32_t.cpp b/benchmarks/bytes_and_flops/bench_int32_t.cpp index 29ccec01414..24a5dcd3899 100644 --- a/benchmarks/bytes_and_flops/bench_int32_t.cpp +++ b/benchmarks/bytes_and_flops/bench_int32_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/benchmarks/bytes_and_flops/bench_int64_t.cpp b/benchmarks/bytes_and_flops/bench_int64_t.cpp index c153d5eff39..0634700c31e 100644 --- a/benchmarks/bytes_and_flops/bench_int64_t.cpp +++ b/benchmarks/bytes_and_flops/bench_int64_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/benchmarks/bytes_and_flops/bench_stride.hpp b/benchmarks/bytes_and_flops/bench_stride.hpp index b63d486fc9e..80f017fbe8f 100644 --- a/benchmarks/bytes_and_flops/bench_stride.hpp +++ b/benchmarks/bytes_and_flops/bench_stride.hpp @@ -15,28 +15,28 @@ //@HEADER #define UNROLL 1 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 2 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 3 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 4 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 5 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 6 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 7 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 8 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL template diff --git 
a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 0f7a298c1bb..e2590911b20 100644 --- a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -26,7 +26,7 @@ struct Run { Kokkos::deep_copy(C, Scalar(3.5)); Kokkos::Timer timer; - for (int i = 0; i < I; ++i) { + for (int iter = 0; iter < I; ++iter) { Kokkos::parallel_for( "BenchmarkKernel", Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)), @@ -87,25 +87,25 @@ struct Run { C(n, i, 0) = a1; #endif #if (UNROLL == 2) - C(n, i, 0) = a1 + a2; + C(n, iter, 0) = a1 + a2; #endif #if (UNROLL == 3) - C(n, i, 0) = a1 + a2 + a3; + C(n, iter, 0) = a1 + a2 + a3; #endif #if (UNROLL == 4) - C(n, i, 0) = a1 + a2 + a3 + a4; + C(n, iter, 0) = a1 + a2 + a3 + a4; #endif #if (UNROLL == 5) - C(n, i, 0) = a1 + a2 + a3 + a4 + a5; + C(n, iter, 0) = a1 + a2 + a3 + a4 + a5; #endif #if (UNROLL == 6) - C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6; + C(n, iter, 0) = a1 + a2 + a3 + a4 + a5 + a6; #endif #if (UNROLL == 7) - C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7; + C(n, iter, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7; #endif #if (UNROLL == 8) - C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8; + C(n, iter, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8; #endif }); } diff --git a/benchmarks/bytes_and_flops/main.cpp b/benchmarks/bytes_and_flops/main.cpp index 20077757d1f..8b5adb84440 100644 --- a/benchmarks/bytes_and_flops/main.cpp +++ b/benchmarks/bytes_and_flops/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "bench.hpp" #include extern template void run_stride_unroll(int, int, int, int, int, int, int, From 5c9a4aa3ce9360892d8c558e19618357366b2640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 22 Feb 2024 18:04:09 +0100 Subject: [PATCH 313/432] bytes_and_flops fix a small bug in command line argument --- benchmarks/bytes_and_flops/main.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/benchmarks/bytes_and_flops/main.cpp b/benchmarks/bytes_and_flops/main.cpp index 8b5adb84440..fdfcc4ea64f 100644 --- a/benchmarks/bytes_and_flops/main.cpp +++ b/benchmarks/bytes_and_flops/main.cpp @@ -86,7 +86,7 @@ int main(int argc, char* argv[]) { printf("D must be one of 1,2,4,8,16,32\n"); return 0; } - if ((P < 1) && (P > 2)) { + if ((P < 1) || (P > 4)) { printf("P must be one of 1,2,3,4\n"); return 0; } From 932466f21364b7d99aa06d5e499c7e4bf1a1c00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 22 Feb 2024 18:10:31 +0100 Subject: [PATCH 314/432] add gather benchmark to CMake --- benchmarks/CMakeLists.txt | 1 + benchmarks/gather/CMakeLists.txt | 4 ++++ benchmarks/gather/gather.hpp | 16 ++++++++-------- benchmarks/gather/gather_unroll.hpp | 2 +- benchmarks/gather/main.cpp | 2 +- 5 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 benchmarks/gather/CMakeLists.txt diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 773544b6f32..285101d2e90 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,4 +1,5 @@ KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) diff --git a/benchmarks/gather/CMakeLists.txt b/benchmarks/gather/CMakeLists.txt new file mode 100644 index 00000000000..24c70627725 --- /dev/null +++ b/benchmarks/gather/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + gather + SOURCES main.cpp +) diff --git a/benchmarks/gather/gather.hpp b/benchmarks/gather/gather.hpp index d83461702c7..90b1101c1d5 100644 --- a/benchmarks/gather/gather.hpp +++ b/benchmarks/gather/gather.hpp @@ -20,28 +20,28 @@ struct RunGather { }; #define UNROLL 1 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 2 -#include +#include "gather_unroll.hpp" #undef UNROLL 
#define UNROLL 3 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 4 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 5 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 6 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 7 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 8 -#include +#include "gather_unroll.hpp" #undef UNROLL template diff --git a/benchmarks/gather/gather_unroll.hpp b/benchmarks/gather/gather_unroll.hpp index 5ee5742a3f7..1aa73091bc5 100644 --- a/benchmarks/gather/gather_unroll.hpp +++ b/benchmarks/gather/gather_unroll.hpp @@ -138,7 +138,7 @@ struct RunGather { printf( "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: " "%lf GGather/s: %lf\n", - sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds, + static_cast(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds, 1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds, 1.e-9 * gather_ops / seconds); } diff --git a/benchmarks/gather/main.cpp b/benchmarks/gather/main.cpp index 7f4fc9ede6c..07fca9fdc64 100644 --- a/benchmarks/gather/main.cpp +++ b/benchmarks/gather/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "gather.hpp" #include int main(int argc, char* argv[]) { From 16d2edbb34925d23602a69ca6866db5d783d2aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 22 Feb 2024 18:14:56 +0100 Subject: [PATCH 315/432] add atomic benchmark to CMake --- benchmarks/CMakeLists.txt | 1 + benchmarks/atomic/CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 benchmarks/atomic/CMakeLists.txt diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 285101d2e90..b2aee8b28bd 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,3 +1,4 @@ +KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) 
KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) diff --git a/benchmarks/atomic/CMakeLists.txt b/benchmarks/atomic/CMakeLists.txt new file mode 100644 index 00000000000..85f7412f492 --- /dev/null +++ b/benchmarks/atomic/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + atomic + SOURCES main.cpp +) From 750ef211ac68526a6677ed298300e6dc0c4b86a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 22 Feb 2024 18:17:32 +0100 Subject: [PATCH 316/432] add policy_performance benchmark to CMake --- benchmarks/CMakeLists.txt | 1 + benchmarks/policy_performance/CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 benchmarks/policy_performance/CMakeLists.txt diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index b2aee8b28bd..4faeb0a20cf 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -3,4 +3,5 @@ KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) diff --git a/benchmarks/policy_performance/CMakeLists.txt b/benchmarks/policy_performance/CMakeLists.txt new file mode 100644 index 00000000000..929b9c97023 --- /dev/null +++ b/benchmarks/policy_performance/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + policy_performance + SOURCES main.cpp +) From 97fa76f29d35c2dcb364473c3bc759510de45c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 22 Feb 2024 18:21:44 +0100 Subject: [PATCH 317/432] fix some warnings in policy_performance benchmark --- benchmarks/policy_performance/main.cpp | 2 +- benchmarks/policy_performance/policy_perf_test.hpp | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/policy_performance/main.cpp b/benchmarks/policy_performance/main.cpp index 28cfde552a5..fe23fa28453 100644 --- 
a/benchmarks/policy_performance/main.cpp +++ b/benchmarks/policy_performance/main.cpp @@ -106,7 +106,7 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { lval += 1; }, result); diff --git a/benchmarks/policy_performance/policy_perf_test.hpp b/benchmarks/policy_performance/policy_perf_test.hpp index cc2cc40257b..0e23d221f67 100644 --- a/benchmarks/policy_performance/policy_perf_test.hpp +++ b/benchmarks/policy_performance/policy_perf_test.hpp @@ -21,13 +21,13 @@ struct ParallelScanFunctor { using value_type = double; ViewType v; - ParallelScanFunctor(const ViewType& v_) : v(v_) {} + explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {} KOKKOS_INLINE_FUNCTION - void operator()(const int idx, value_type& val, const bool& final) const { + void operator()(const int idx, value_type& val, const bool& is_final) const { // inclusive scan val += v(idx); - if (final) { + if (is_final) { v(idx) = val; } } @@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range, vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); } v2(idx, t) = vector_result; @@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range, team_result = 0.0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { lval += 1; }, team_result); + [&](const int, double& lval) { lval += 1; }, team_result); } v1(idx) = team_result; // prevent compiler optimizing loop away @@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range, for (int tr = 0; tr < thread_repeat; ++tr) { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, 
thread_range), - [&](const int t, double& lval) { + [&](const int, double& lval) { double vector_result = 0.0; for (int vr = 0; vr < inner_repeat; ++vr) { vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); lval += vector_result; } From 715d6156e9729e2be930ff1c7845e32a27efe164 Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Fri, 23 Feb 2024 17:58:05 +0100 Subject: [PATCH 318/432] policy_benchmark: fix indentation --- benchmarks/policy_performance/main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/policy_performance/main.cpp b/benchmarks/policy_performance/main.cpp index fe23fa28453..0983a3d535c 100644 --- a/benchmarks/policy_performance/main.cpp +++ b/benchmarks/policy_performance/main.cpp @@ -106,8 +106,9 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, - double& lval) { lval += 1; }, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { + lval += 1; + }, result); using view_type_1d = Kokkos::View; From 4dcbff2cf4b6a4b7476b3cf647e7d0b8adf10808 Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Sun, 3 Mar 2024 19:26:49 +0100 Subject: [PATCH 319/432] Benchmarks: disable 2 benchmarks for OpenMPTarget Apply Rahul suggestion to disable two benchmarks that are causing Internal Compiler Errors with OpenMPTarget. --- benchmarks/CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4faeb0a20cf..abf50283594 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,7 +1,12 @@ +#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. 
KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) + +#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. +IF(NOT Kokkos_ENABLE_OPENMPTARGET) + KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) + KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) +ENDIF() From 3d485c19da4aea22021df2673df2183d35fc87f2 Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Tue, 5 Mar 2024 10:13:33 +0100 Subject: [PATCH 320/432] bytes_and_flops: fix a counter name --- benchmarks/bytes_and_flops/bench_unroll_stride.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index e2590911b20..78cfd48effe 100644 --- a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -87,25 +87,25 @@ struct Run { C(n, i, 0) = a1; #endif #if (UNROLL == 2) - C(n, iter, 0) = a1 + a2; + C(n, i, 0) = a1 + a2; #endif #if (UNROLL == 3) - C(n, iter, 0) = a1 + a2 + a3; + C(n, i, 0) = a1 + a2 + a3; #endif #if (UNROLL == 4) - C(n, iter, 0) = a1 + a2 + a3 + a4; + C(n, i, 0) = a1 + a2 + a3 + a4; #endif #if (UNROLL == 5) - C(n, iter, 0) = a1 + a2 + a3 + a4 + a5; + C(n, i, 0) = a1 + a2 + a3 + a4 + a5; #endif #if (UNROLL == 6) - C(n, iter, 0) = a1 + a2 + a3 + a4 + a5 + a6; + C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6; #endif #if (UNROLL == 7) - C(n, iter, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7; + C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7; #endif #if (UNROLL == 8) - C(n, iter, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8; + C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8; #endif 
}); } From 97a94b60af6f422304e07726c016afe7f41f8380 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 4 Mar 2024 10:12:45 -0500 Subject: [PATCH 321/432] Fix C-style cast --- core/src/Serial/Kokkos_Serial_Task.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Serial/Kokkos_Serial_Task.hpp b/core/src/Serial/Kokkos_Serial_Task.hpp index f9c86f55ce0..5905d6d32e1 100644 --- a/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/core/src/Serial/Kokkos_Serial_Task.hpp @@ -121,7 +121,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; auto& data = serial_execution_space.impl_internal_space_instance() @@ -157,7 +157,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; From 99c7e1b1c430f2012eaea2f4bec90e8e0858b9f9 Mon Sep 17 00:00:00 2001 From: Thomas Padioleau Date: Tue, 5 Mar 2024 19:26:06 +0100 Subject: [PATCH 322/432] Fix amdclang++ compilation (#6857) * Fix amdclang++ compilation * Add guards for hipcc --- cmake/kokkos_arch.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 5ee6f44613e..34e9f05986f 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -571,6 +571,11 @@ IF (KOKKOS_ENABLE_HIP) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) + IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT --hip-link + ) + ENDIF() ELSE() COMPILER_SPECIFIC_FLAGS( DEFAULT -fno-gpu-rdc From 9feb104d9b3ce86230f1b56c6207246f67f22de1 Mon Sep 
17 00:00:00 2001 From: Daniel Arndt Date: Tue, 5 Mar 2024 13:40:20 -0500 Subject: [PATCH 323/432] Fix fallback implementation for sort_by_key (#6856) * Fix fallback implementation for sort_by_key * Guard with KOKKOS_ENABLE_ONEDPL * Drop sort_on_device * Improve wording * Improve comment --- .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 119 ++++++++++++------ 1 file changed, 80 insertions(+), 39 deletions(-) diff --git a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index 698d059f939..5dc7047dde3 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -87,7 +87,20 @@ static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { "LayoutRight, LayoutLeft or LayoutStride."); } +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. 
+template +inline constexpr bool sort_on_device_v = false; + #if defined(KOKKOS_ENABLE_CUDA) +template +inline constexpr bool sort_on_device_v = true; + template void sort_by_key_cudathrust( @@ -104,6 +117,12 @@ void sort_by_key_cudathrust( } #endif +#if defined(KOKKOS_ENABLE_ONEDPL) +template +inline constexpr bool sort_on_device_v = + std::is_same_v || + std::is_same_v; + #ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY template @@ -126,6 +145,7 @@ void sort_by_key_onedpl( std::forward(maybeComparator)...); } #endif +#endif template void applyPermutation(const ExecutionSpace& space, @@ -152,6 +172,8 @@ void sort_by_key_via_sort( const Kokkos::View& keys, const Kokkos::View& values, MaybeComparator&&... maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + auto const n = keys.size(); Kokkos::View permute( @@ -165,48 +187,67 @@ void sort_by_key_via_sort( Kokkos::RangePolicy(exec, 0, n), KOKKOS_LAMBDA(int i) { permute(i) = i; }); -// FIXME OPENMPTARGET The sort happens on the host so we have to copy keys there -#ifdef KOKKOS_ENABLE_OPENMPTARGET - auto keys_in_comparator = Kokkos::create_mirror_view( - Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), - keys); - Kokkos::deep_copy(exec, keys_in_comparator, keys); -#else - auto keys_in_comparator = keys; -#endif - - static_assert(sizeof...(MaybeComparator) <= 1); - if constexpr (sizeof...(MaybeComparator) == 0) { -#ifdef KOKKOS_ENABLE_SYCL - auto* raw_keys_in_comparator = keys_in_comparator.data(); - auto stride = keys_in_comparator.stride(0); - Kokkos::sort( - exec, permute, KOKKOS_LAMBDA(int i, int j) { - return raw_keys_in_comparator[i * stride] < - raw_keys_in_comparator[j * stride]; - }); -#else - Kokkos::sort( - exec, permute, KOKKOS_LAMBDA(int i, int j) { - return keys_in_comparator(i) < keys_in_comparator(j); - }); -#endif + using Layout = + typename Kokkos::View::array_layout; + if constexpr (!sort_on_device_v) { + auto host_keys = Kokkos::create_mirror_view( + 
Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + host_exec, host_permute, + KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(host_keys(i), host_keys(j)); + }); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); } else { - auto keys_comparator = - std::get<0>(std::tuple(maybeComparator...)); #ifdef KOKKOS_ENABLE_SYCL - auto* raw_keys_in_comparator = keys_in_comparator.data(); - auto stride = keys_in_comparator.stride(0); - Kokkos::sort( - exec, permute, KOKKOS_LAMBDA(int i, int j) { - return keys_comparator(raw_keys_in_comparator[i * stride], - raw_keys_in_comparator[j * stride]); - }); + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); + } #else - Kokkos::sort( - exec, permute, KOKKOS_LAMBDA(int i, int j) { - return keys_comparator(keys_in_comparator(i), keys_in_comparator(j)); 
- }); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, + KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(keys(i), keys(j)); + }); + } #endif } From c3c8a70d21ffe88f810bfe392fb5a0457c62c928 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> Date: Tue, 5 Mar 2024 11:52:20 -0700 Subject: [PATCH 324/432] Update the unsafe implicit conversion error message in MDRangePolicy (#6855) * Updated the error message in MDRangePolicy about unsafe implicit conversions * Addressed clang warnings --- core/src/KokkosExp_MDRangePolicy.hpp | 19 ++++++++++++------- .../TestMDRangePolicyConstructors.hpp | 13 ++++++++----- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index ff49c13cbad..2df274b81f9 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -73,7 +73,7 @@ is_less_than_value_initialized_variable(T arg) { // Checked narrowing conversion that calls abort if the cast changes the value template -constexpr To checked_narrow_cast(From arg) { +constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = (std::is_signed::value != std::is_signed::value); auto const ret = static_cast(arg); @@ -81,7 +81,12 @@ constexpr To checked_narrow_cast(From arg) { (is_different_signedness && is_less_than_value_initialized_variable(arg) != is_less_than_value_initialized_variable(ret))) { - Kokkos::abort("unsafe narrowing conversion"); + auto msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(arg) + ") in dimension (" + std::to_string(idx) + + "), which may not preserve its original value.\n"; + 
Kokkos::abort(msg.c_str()); } return ret; } @@ -102,9 +107,9 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { // std::transform(std::begin(init), std::end(init), a.data(), // [](U x) { return static_cast(x); }); // except that std::transform is not constexpr. - for (auto x : init) { - *ptr++ = checked_narrow_cast(x); - (void)checked_narrow_cast(x); // see note above + for (std::size_t i = 0; i < M; ++i) { + *ptr++ = checked_narrow_cast(init[i], i); + (void)checked_narrow_cast(init[i], i); // see note above } return a; } @@ -122,8 +127,8 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( constexpr std::size_t N = a.size(); static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { - a[i] = checked_narrow_cast(other[i]); - (void)checked_narrow_cast(other[i]); // see note above + a[i] = checked_narrow_cast(other[i], i); + (void)checked_narrow_cast(other[i], i); // see note above } return a; } diff --git a/core/unit_test/TestMDRangePolicyConstructors.hpp b/core/unit_test/TestMDRangePolicyConstructors.hpp index 306f89413e0..6f241b45d47 100644 --- a/core/unit_test/TestMDRangePolicyConstructors.hpp +++ b/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -88,12 +88,15 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { using Policy = Kokkos::MDRangePolicy, Kokkos::IndexType>; + std::string msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is " + "performed on a bound (-1) in dimension (0), which may not preserve its " + "original value.\n"; + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { - (void)Policy({-1, 0}, {2, 3}); - }, - "unsafe narrowing conversion"); + ASSERT_DEATH({ (void)Policy({-1, 0}, {2, 3}); }, expected); } TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { From 9a7e7958ae6e35124546663936520f93f964d021 Mon Sep 17 00:00:00 2001 From: Nicolas 
Morales Date: Tue, 5 Mar 2024 19:09:54 -0800 Subject: [PATCH 325/432] Split some classes from Kokkos_ViewMapping (#6859) * move ViewOffset and ViewDataAnalysis to a separate header * only include Kokkos_Macro in ViewDataAnalysis --- core/src/impl/Kokkos_ViewDataAnalysis.hpp | 402 ++++++++++++++++++++++ core/src/impl/Kokkos_ViewMapping.hpp | 392 +-------------------- 2 files changed, 403 insertions(+), 391 deletions(-) create mode 100644 core/src/impl/Kokkos_ViewDataAnalysis.hpp diff --git a/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/core/src/impl/Kokkos_ViewDataAnalysis.hpp new file mode 100644 index 00000000000..04c0c9aeede --- /dev/null +++ b/core/src/impl/Kokkos_ViewDataAnalysis.hpp @@ -0,0 +1,402 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_DATA_ANALYSIS_HPP +#define KOKKOS_VIEW_DATA_ANALYSIS_HPP + +#include + +namespace Kokkos::Impl { + +template +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template +struct variadic_size_t { + enum : size_t { value = variadic_size_t::value }; +}; + +template +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template +struct rank_dynamic { + enum : unsigned { value = (Val == 0 ? 
1 : 0) + rank_dynamic::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template \ + constexpr size_t ViewDimension##R::ArgN##R; \ + template \ + constexpr size_t ViewDimension##R::N##R; \ + template \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + +// MSVC does not do empty base class optimization by default. 
+// Per standard it is required for standard layout types +template +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0::value, + rank_dynamic::value>, + public ViewDimension1::value, + rank_dynamic::value>, + public ViewDimension2::value, + rank_dynamic::value>, + public ViewDimension3::value, + rank_dynamic::value>, + public ViewDimension4::value, + rank_dynamic::value>, + public ViewDimension5::value, + rank_dynamic::value>, + public ViewDimension6::value, + rank_dynamic::value>, + public ViewDimension7::value, + rank_dynamic::value> { + using D0 = ViewDimension0::value, + rank_dynamic::value>; + using D1 = ViewDimension1::value, + rank_dynamic::value>; + using D2 = ViewDimension2::value, + rank_dynamic::value>; + using D3 = ViewDimension3::value, + rank_dynamic::value>; + using D4 = ViewDimension4::value, + rank_dynamic::value>; + using D5 = ViewDimension5::value, + rank_dynamic::value>; + using D6 = ViewDimension6::value, + rank_dynamic::value>; + using D7 = ViewDimension7::value, + rank_dynamic::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using D5::N5; + using D6::N6; + using D7::N7; + + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), + D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), + D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), + D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), + D4(n4 == KOKKOS_INVALID_INDEX ? 
1 : n4), + D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), + D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), + D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template + struct prepend { + using type = ViewDimension; + }; + + template + struct append { + using type = ViewDimension; + }; +}; + +template +struct ViewDimensionJoin; + +template +struct ViewDimensionJoin, ViewDimension> { + using type = ViewDimension; +}; + +//---------------------------------------------------------------------------- + +template +struct ViewDimensionAssignable; + +template +struct ViewDimensionAssignable, + ViewDimension> { + using dst = ViewDimension; + using src = ViewDimension; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? 
(size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +/** \brief Given a value type and dimension generate the View data type */ +template +struct ViewDataType; + +template +struct ViewDataType> { + using type = T; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type[N]; +}; + +/**\brief Analysis of View data type. + * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. + */ +template +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = std::add_const_t; + using non_const_value_type = std::remove_const_t; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using 
non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same::value); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType::type; + using const_type = typename ViewDataType::type; + using non_const_type = + typename ViewDataType::type; + + // Generate "flattened" multidimensional array specification type. 
+ using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +template +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_DATA_ANALYSIS_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 708472be9d3..a2b41d98a91 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -33,255 +33,7 @@ #include #include #include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct variadic_size_t { - enum : size_t { value = KOKKOS_INVALID_INDEX }; -}; - -template -struct variadic_size_t<0, Val, Args...> { - enum : size_t { value = Val }; -}; - -template -struct variadic_size_t { - enum : size_t { value = variadic_size_t::value }; -}; - -template -struct rank_dynamic; - -template <> -struct rank_dynamic<> { - enum : unsigned { value = 0 }; -}; - -template -struct rank_dynamic { - enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; -}; - -#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ - template \ - struct ViewDimension##R { \ - static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - }; \ - template \ - constexpr size_t ViewDimension##R::ArgN##R; \ - template \ - constexpr size_t ViewDimension##R::N##R; \ - template \ - struct ViewDimension##R<0u, RD> { \ - static constexpr size_t ArgN##R = 0; \ - std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ - }; \ - template \ - constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; - -KOKKOS_IMPL_VIEW_DIMENSION(0) -KOKKOS_IMPL_VIEW_DIMENSION(1) -KOKKOS_IMPL_VIEW_DIMENSION(2) -KOKKOS_IMPL_VIEW_DIMENSION(3) -KOKKOS_IMPL_VIEW_DIMENSION(4) -KOKKOS_IMPL_VIEW_DIMENSION(5) -KOKKOS_IMPL_VIEW_DIMENSION(6) -KOKKOS_IMPL_VIEW_DIMENSION(7) - -#undef KOKKOS_IMPL_VIEW_DIMENSION - -// MSVC does not do empty base class optimization by default. 
-// Per standard it is required for standard layout types -template -struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension - : public ViewDimension0::value, - rank_dynamic::value>, - public ViewDimension1::value, - rank_dynamic::value>, - public ViewDimension2::value, - rank_dynamic::value>, - public ViewDimension3::value, - rank_dynamic::value>, - public ViewDimension4::value, - rank_dynamic::value>, - public ViewDimension5::value, - rank_dynamic::value>, - public ViewDimension6::value, - rank_dynamic::value>, - public ViewDimension7::value, - rank_dynamic::value> { - using D0 = ViewDimension0::value, - rank_dynamic::value>; - using D1 = ViewDimension1::value, - rank_dynamic::value>; - using D2 = ViewDimension2::value, - rank_dynamic::value>; - using D3 = ViewDimension3::value, - rank_dynamic::value>; - using D4 = ViewDimension4::value, - rank_dynamic::value>; - using D5 = ViewDimension5::value, - rank_dynamic::value>; - using D6 = ViewDimension6::value, - rank_dynamic::value>; - using D7 = ViewDimension7::value, - rank_dynamic::value>; - - using D0::ArgN0; - using D1::ArgN1; - using D2::ArgN2; - using D3::ArgN3; - using D4::ArgN4; - using D5::ArgN5; - using D6::ArgN6; - using D7::ArgN7; - - using D0::N0; - using D1::N1; - using D2::N2; - using D3::N3; - using D4::N4; - using D5::N5; - using D6::N6; - using D7::N7; - - static constexpr unsigned rank = sizeof...(Vals); - static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; - - KOKKOS_INLINE_FUNCTION - constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, - size_t n5, size_t n6, size_t n7) - : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), - D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), - D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), - D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), - D4(n4 == KOKKOS_INVALID_INDEX ? 
1 : n4), - D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), - D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), - D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} - - KOKKOS_INLINE_FUNCTION - constexpr size_t extent(const unsigned r) const noexcept { - return r == 0 - ? N0 - : (r == 1 - ? N1 - : (r == 2 - ? N2 - : (r == 3 - ? N3 - : (r == 4 - ? N4 - : (r == 5 - ? N5 - : (r == 6 - ? N6 - : (r == 7 ? N7 - : 0))))))); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return r == 0 - ? ArgN0 - : (r == 1 - ? ArgN1 - : (r == 2 - ? ArgN2 - : (r == 3 - ? ArgN3 - : (r == 4 - ? ArgN4 - : (r == 5 - ? ArgN5 - : (r == 6 - ? ArgN6 - : (r == 7 ? ArgN7 - : 0))))))); - } - - template - struct prepend { - using type = ViewDimension; - }; - - template - struct append { - using type = ViewDimension; - }; -}; - -template -struct ViewDimensionJoin; - -template -struct ViewDimensionJoin, ViewDimension> { - using type = ViewDimension; -}; - -//---------------------------------------------------------------------------- - -template -struct ViewDimensionAssignable; - -template -struct ViewDimensionAssignable, - ViewDimension> { - using dst = ViewDimension; - using src = ViewDimension; - - enum { - value = unsigned(dst::rank) == unsigned(src::rank) && - ( - // Compile time check that potential static dimensions match - ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) - ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) - : true) && - ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) - ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) - : true) && - ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) - ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) - : true) && - ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) - ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) - : true) && - ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) - ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) - : true) && - ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) - ? 
(size_t(dst::ArgN5) == size_t(src::ArgN5)) - : true) && - ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) - ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) - : true) && - ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) - ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) - : true)) - }; -}; - -} // namespace Impl -} // namespace Kokkos +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -707,148 +459,6 @@ struct SubviewExtents { namespace Kokkos { namespace Impl { - -/** \brief Given a value type and dimension generate the View data type */ -template -struct ViewDataType; - -template -struct ViewDataType> { - using type = T; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type[N]; -}; - -/**\brief Analysis of View data type. - * - * Data type conforms to one of the following patterns : - * {const} value_type [][#][#][#] - * {const} value_type ***[#][#][#] - * Where the sum of counts of '*' and '[#]' is at most ten. - * - * Provide alias for ViewDimension<...> and value_type. 
- */ -template -struct ViewArrayAnalysis { - using value_type = T; - using const_value_type = std::add_const_t; - using non_const_value_type = std::remove_const_t; - using static_dimension = ViewDimension<>; - using dynamic_dimension = ViewDimension<>; - using dimension = ViewDimension<>; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using static_dimension = - typename nested::static_dimension::template prepend::type; - - using dynamic_dimension = typename nested::dynamic_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - using nested_dimension = typename nested::dimension; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewDataAnalysis { - private: - using array_analysis = ViewArrayAnalysis; - - // ValueType is opportunity for partial 
specialization. - // Must match array analysis when this default template is used. - static_assert( - std::is_same::value); - - public: - using specialize = void; // No specialization - - using dimension = typename array_analysis::dimension; - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - // Generate analogous multidimensional array specification type. - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - // Generate "flattened" multidimensional array specification type. - using scalar_array_type = type; - using const_scalar_array_type = const_type; - using non_const_scalar_array_type = non_const_type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewOffset { - using is_mapping_plugin = std::false_type; -}; - //---------------------------------------------------------------------------- // LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding template From 121964a93a316d54cea8e11668eda281cea38962 Mon Sep 17 00:00:00 2001 From: Nicolas Morales Date: Tue, 5 Mar 2024 13:26:59 -0800 Subject: [PATCH 326/432] update mdspan tpl --- .../__p0009_bits/compressed_pair.hpp | 166 ++-- .../experimental/__p0009_bits/config.hpp | 9 +- .../experimental/__p0009_bits/extents.hpp | 68 +- .../experimental/__p0009_bits/layout_left.hpp | 78 +- .../__p0009_bits/layout_right.hpp | 78 +- .../__p0009_bits/layout_stride.hpp | 115 ++- .../experimental/__p0009_bits/mdspan.hpp | 65 +- .../experimental/__p1684_bits/mdarray.hpp | 70 +- .../__p2630_bits/strided_slice.hpp | 9 +- 
.../experimental/__p2630_bits/submdspan.hpp | 10 +- .../__p2630_bits/submdspan_extents.hpp | 2 - .../__p2630_bits/submdspan_mapping.hpp | 70 +- .../__p2642_bits/layout_padded.hpp | 793 ++++++++++++++++++ .../__p2642_bits/layout_padded_fwd.hpp | 117 +++ tpls/mdspan/include/mdspan/mdspan.hpp | 1 + 15 files changed, 1359 insertions(+), 292 deletions(-) create mode 100644 tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp create mode 100644 tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp diff --git a/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp b/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp index ab1561bd47f..25389a2fa5e 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp @@ -27,165 +27,165 @@ namespace detail { // For no unique address emulation, this is the case taken when neither are empty. // For real `[[no_unique_address]]`, this case is always taken. 
-template struct __compressed_pair { - _MDSPAN_NO_UNIQUE_ADDRESS _T __t_val; - _MDSPAN_NO_UNIQUE_ADDRESS _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; +template struct __compressed_pair { + _MDSPAN_NO_UNIQUE_ADDRESS _T1 __t1_val{}; + _MDSPAN_NO_UNIQUE_ADDRESS _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr 
__compressed_pair(_TLike &&__t, _ULike &&__u) - : __t_val((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : __t1_val((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) // First empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>> - : private _T { - _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { - return *static_cast<_T *>(this); + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && !_MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T1 { + _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { + return *static_cast<_T1 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return *static_cast<_T const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return *static_cast<_T1 const *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr 
__compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _T((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T1((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; // Second empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t> - : private _U { - _T __t_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; + _T1, _T2, + std::enable_if_t> + : private _T2 { + _T1 __t1_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { - return *static_cast<_U *>(this); + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { + return *static_cast<_T2 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return *static_cast<_U const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return *static_cast<_T2 const *>(this); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr 
__compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; + ~__compressed_pair() = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _U((_ULike &&) __u), __t_val((_TLike &&) __t) {} + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T2((_T2Like &&) __t2), __t1_val((_T1Like &&) __t1) {} }; // Both empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> // We need to use the __no_unique_address_emulation wrapper here to avoid // base class ambiguities. #ifdef _MDSPAN_COMPILER_MSVC // MSVC doesn't allow you to access public static member functions of a type // when you *happen* to privately inherit from that type. 
- : protected __no_unique_address_emulation<_T, 0>, - protected __no_unique_address_emulation<_U, 1> + : protected __no_unique_address_emulation<_T1, 0>, + protected __no_unique_address_emulation<_T2, 1> #else - : private __no_unique_address_emulation<_T, 0>, - private __no_unique_address_emulation<_U, 1> + : private __no_unique_address_emulation<_T1, 0>, + private __no_unique_address_emulation<_T2, 1> #endif { - using __first_base_t = __no_unique_address_emulation<_T, 0>; - using __second_base_t = __no_unique_address_emulation<_U, 1>; + using __first_base_t = __no_unique_address_emulation<_T1, 0>; + using __second_base_t = __no_unique_address_emulation<_T2, 1>; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return this->__second_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { return this->__second_base_t::__ref(); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED 
__compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) noexcept - : __first_base_t(_T((_TLike &&) __t)), - __second_base_t(_U((_ULike &&) __u)) + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) noexcept + : __first_base_t(_T1((_T1Like &&) __t1)), + __second_base_t(_T2((_T2Like &&) __t2)) { } }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index d35e201cebd..8e42a37ba7c 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -35,10 +35,17 @@ #define MDSPAN_CXX_STD_14 201402L #define MDSPAN_CXX_STD_17 201703L #define MDSPAN_CXX_STD_20 202002L +// Note GCC has not updated this in version 13 +#ifdef __clang__ +#define MDSPAN_CXX_STD_23 202302L +#else +#define MDSPAN_CXX_STD_23 202100L +#endif #define MDSPAN_HAS_CXX_14 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14) #define MDSPAN_HAS_CXX_17 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_17) #define MDSPAN_HAS_CXX_20 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_20) +#define MDSPAN_HAS_CXX_23 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_23) static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or later."); @@ -224,7 +231,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef MDSPAN_CONDITIONAL_EXPLICIT -# if MDSPAN_HAS_CXX_20 && !defined(_MDSPAN_COMPILER_MSVC) +# if MDSPAN_HAS_CXX_20 # define MDSPAN_CONDITIONAL_EXPLICIT(COND) 
explicit(COND) # else # define MDSPAN_CONDITIONAL_EXPLICIT(COND) diff --git a/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 0dd31c4cd0a..9a28c3ed5ca 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -55,6 +55,14 @@ __check_compatible_extents( return {}; } +template +MDSPAN_INLINE_FUNCTION +static constexpr bool are_valid_indices() { + return + (std::is_convertible::value && ... && true) && + (std::is_nothrow_constructible::value && ... && true); +} + // ------------------------------------------------------------------ // ------------ static_array ---------------------------------------- // ------------------------------------------------------------------ @@ -140,7 +148,8 @@ struct index_sequence_scan_impl { template struct index_sequence_scan_impl { -#if defined(__NVCC__) || defined(__NVCOMPILER) +#if defined(__NVCC__) || defined(__NVCOMPILER) || \ + defined(_MDSPAN_COMPILER_INTEL) // NVCC warns about pointless comparison with 0 for R==0 and r being const // evaluatable and also 0. MDSPAN_INLINE_FUNCTION @@ -167,7 +176,7 @@ template <> struct index_sequence_scan_impl<0> { // all static values. 
template struct possibly_empty_array { - T vals[N]; + T vals[N]{}; MDSPAN_INLINE_FUNCTION constexpr T &operator[](size_t r) { return vals[r]; } MDSPAN_INLINE_FUNCTION @@ -251,12 +260,17 @@ struct maybe_static_array { #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, - /* requires */ (N == m_size_dynamic)) + /* requires */ (N == m_size_dynamic && N > 0)) MDSPAN_INLINE_FUNCTION constexpr maybe_static_array(const std::span &vals) { for (size_t r = 0; r < N; r++) m_dyn_vals[r] = static_cast(vals[r]); } + + MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, + /* requires */ (N == m_size_dynamic && N == 0)) + MDSPAN_INLINE_FUNCTION + constexpr maybe_static_array(const std::span &) : m_dyn_vals{} {} #endif // constructors from all values @@ -423,9 +437,9 @@ template class extents { class OtherIndexType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && + _MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, - OtherIndexType) && + const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -436,8 +450,8 @@ template class extents { MDSPAN_TEMPLATE_REQUIRES( class OtherIndexType, size_t N, /* requires */ - (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) && + (_MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -454,6 +468,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... 
DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -468,6 +483,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -481,6 +497,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &, @@ -491,17 +508,20 @@ template class extents { public: // Converting constructor from other extents specializations - MDSPAN_TEMPLATE_REQUIRES( - class OtherIndexType, size_t... OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes - don't match? */ - decltype(detail::__check_compatible_extents( - std::integral_constant{}, + MDSPAN_TEMPLATE_REQUIRES( + class OtherIndexType, size_t... OtherExtents, + /* requires */ + ( + /* multi-stage check to protect from invalid pack expansion when sizes + don't match? 
*/ + decltype(detail::__check_compatible_extents( + // using: sizeof...(Extents) == sizeof...(OtherExtents) as the second argument fails with MSVC+NVCC with some obscure expansion error + // MSVC: 19.38.33133 NVCC: 12.0 + std::integral_constant::rank() == extents::rank()>{}, std::integer_sequence{}, - std::integer_sequence{}))::value)) + std::integer_sequence{}))::value + ) + ) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || @@ -518,10 +538,14 @@ template class extents { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents &rhs) noexcept { - bool value = true; - for (size_type r = 0; r < m_rank; r++) - value &= rhs.extent(r) == lhs.extent(r); - return value; + if constexpr (rank() != extents::rank()) { + return false; + } else { + using common_t = std::common_type_t; + for (size_type r = 0; r < m_rank; r++) + if(static_cast(rhs.extent(r)) != static_cast(lhs.extent(r))) return false; + } + return true; } #if !(MDSPAN_HAS_CXX_20) @@ -570,7 +594,7 @@ using dextents = typename detail::__make_dextents::type; template extents(IndexTypes...) -> extents; + ((void) sizeof(IndexTypes), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent)...>; #endif // Helper type traits for identifying a class as extents. diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index af44494a98d..83ed9ef7fe3 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,6 +18,9 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" +#include +#include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -108,6 +111,36 @@ class layout_left::mapping { */ } +#if MDSPAN_HAS_CXX_17 + /** + * Converting constructor from `layout_left_padded::mapping`. 
+ * + * This overload participates in overload resolution only if _Mapping is a layout_left_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping& __other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -124,13 +157,14 @@ class layout_left::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=0; r<__extents.rank(); r++) { - if(stride != static_cast(other.stride(r))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=0; r<__extents.rank(); r++) { + if(static_cast(stride) != static_cast(other.stride(r))) + std::abort(); // ("Assigning layout_stride to layout_left with invalid strides."); + stride *= __extents.extent(r); } - stride *= __extents.extent(r); } #endif } @@ -155,10 +189,7 @@ class layout_left::mapping { class... 
Indices, /* requires */ ( (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -172,9 +203,9 @@ class layout_left::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -187,7 +218,10 @@ class layout_left::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -195,7 +229,10 @@ class layout_left::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -215,6 +252,17 @@ class layout_left::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], 
submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index a0586484202..3d3927df7bc 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -20,6 +20,7 @@ #include "extents.hpp" #include #include "layout_stride.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -113,6 +114,34 @@ class layout_right::mapping { */ } + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_right_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
+ */ +#if MDSPAN_HAS_CXX_17 + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping &__other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -129,13 +158,14 @@ class layout_right::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=__extents.rank(); r>0; r--) { - if(stride != static_cast(other.stride(r-1))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=__extents.rank(); r>0; r--) { + if(static_cast(stride) != static_cast(other.stride(r-1))) + std::abort(); // ("Assigning layout_stride to layout_right with invalid strides."); + stride *= __extents.extent(r-1); } - stride *= __extents.extent(r-1); } #endif } @@ -157,13 +187,10 @@ class layout_right::mapping { //-------------------------------------------------------------------------------- MDSPAN_TEMPLATE_REQUIRES( - class... Indices, + class ... 
Indices, /* requires */ ( - (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (sizeof...(Indices) == extents_type::rank()) && + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -174,9 +201,9 @@ class layout_right::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -189,7 +216,10 @@ class layout_right::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -197,7 +227,10 @@ class layout_right::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ (Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ 
-217,6 +250,17 @@ class layout_right::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 030a494529b..15ad577d149 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -88,7 +88,7 @@ struct layout_stride { : private detail::__no_unique_address_emulation< detail::__compressed_pair< Extents, - std::array + detail::possibly_empty_array > > #endif @@ -109,7 +109,7 @@ struct layout_stride { //---------------------------------------------------------------------------- - using __strides_storage_t = std::array; + using __strides_storage_t = detail::possibly_empty_array; using __member_pair_t = detail::__compressed_pair; #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) @@ -158,14 +158,16 @@ struct layout_stride { template MDSPAN_INLINE_FUNCTION static constexpr bool _eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */) - && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_AND((static_cast(self.stride(Idxs)) == static_cast(other.stride(Idxs))) /* && ... */) + && _MDSPAN_FOLD_AND((static_cast(self.extents().extent(Idxs)) == static_cast(other.extents().extent(Idxs))) /* || ... 
*/); } template MDSPAN_INLINE_FUNCTION static constexpr bool _not_eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */) - || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_OR((static_cast(self.stride(Idxs)) != static_cast(other.stride(Idxs))) /* || ... */) + || _MDSPAN_FOLD_OR((static_cast(self.extents().extent(Idxs)) != static_cast(other.extents().extent(Idxs))) /* || ... */); } template @@ -205,6 +207,11 @@ struct layout_stride { } #endif + MDSPAN_INLINE_FUNCTION + static constexpr std::array return_strides(const __strides_storage_t& s) { + return std::array{s[Idxs]...}; + } + template MDSPAN_INLINE_FUNCTION static constexpr size_t __return_zero() { return 0; } @@ -218,6 +225,21 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. 
using __impl = __deduction_workaround>; + static constexpr __strides_storage_t strides_storage(std::true_type) { + __strides_storage_t s{}; + + extents_type e; + index_type stride = 1; + for(int r = static_cast(extents_type::rank() - 1); r >= 0; r--) { + s[r] = stride; + stride *= e.extent(r); + } + + return s; + } + static constexpr __strides_storage_t strides_storage(std::false_type) { + return {}; + } //---------------------------------------------------------------------------- @@ -233,7 +255,21 @@ struct layout_stride { //-------------------------------------------------------------------------------- - MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + extents_type(), + __strides_storage_t(strides_storage(std::integral_constant 0)>{})) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + {} + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; MDSPAN_TEMPLATE_REQUIRES( @@ -332,10 +368,10 @@ struct layout_stride { ) #endif MDSPAN_CONDITIONAL_EXPLICIT( - (!std::is_convertible::value) && - (detail::__is_mapping_of || - detail::__is_mapping_of || - detail::__is_mapping_of) + !(std::is_convertible::value && + (detail::__is_mapping_of || + detail::__is_mapping_of || + detail::__is_mapping_of)) ) // needs two () due to comma MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 mapping(StridedLayoutMapping const& other) noexcept // NOLINT(google-explicit-constructor) @@ -374,7 +410,7 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION constexpr std::array< index_type, extents_type::rank() > strides() const noexcept { - return __strides_storage(); + return __impl::return_strides(__strides_storage()); } MDSPAN_INLINE_FUNCTION @@ -393,8 +429,7 @@ struct layout_stride { class... 
Indices, /* requires */ ( sizeof...(Indices) == Extents::rank() && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -410,17 +445,37 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { - return required_span_size() == __get_size(extents(), std::make_index_sequence()); + if constexpr (extents_type::rank() == 0) + return true; + else { + index_type span_size = required_span_size(); + if (span_size == static_cast(0)) { + if constexpr (extents_type::rank() == 1) { + return stride(0) == 1; + } else { + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; + } + } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + } else { + return required_span_size() == __get_size(extents(), std::make_index_sequence()); + } + } } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION - constexpr index_type stride(rank_type r) const noexcept -#if MDSPAN_HAS_CXX_20 - requires ( Extents::rank() > 0 ) -#endif - { + constexpr index_type stride(rank_type r) const noexcept { return __strides_storage()[r]; } @@ -444,10 +499,13 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { bool strides_match = true; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (x.stride(r) == y.stride(r)); + if constexpr (extents_type::rank() > 0) { + using common_t = std::common_type_t; + for(rank_type r = 0; r < 
extents_type::rank(); r++) + strides_match = strides_match && (static_cast(x.stride(r)) == static_cast(y.stride(r))); + } return (x.extents() == y.extents()) && - (__impl::__OFFSET(y)== static_cast(0)) && + (__impl::__OFFSET(y) == static_cast(0)) && strides_match; } @@ -489,6 +547,17 @@ struct layout_stride { } #endif + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index 6febe300215..d6ec49e65bf 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -55,6 +55,13 @@ class mdspan ReferenceType __callop(mdspan const& __self, const std::array& indices) noexcept { return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); } +#ifdef __cpp_lib_span + template + MDSPAN_FORCE_INLINE_FUNCTION static constexpr + ReferenceType __callop(mdspan const& __self, const std::span& indices) noexcept { + return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); + } +#endif }; public: @@ -109,9 +116,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... 
*/) && ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) && + (detail::are_valid_indices()) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) ) @@ -125,8 +131,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -142,8 +148,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -160,7 +166,7 @@ class mdspan (MDSPAN_INLINE_FUNCTION constexpr), mdspan, (data_handle_type p, const extents_type& exts), , /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) && - _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type)) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const extents_type&)) ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type())) { } @@ -179,10 +185,14 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor, /* requires */ ( - 
_MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping) && - _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const typename OtherLayoutPolicy::template mapping&) && + _MDSPAN_TRAIT(std::is_constructible, accessor_type, const OtherAccessor&) ) ) + MDSPAN_CONDITIONAL_EXPLICIT( + !_MDSPAN_TRAIT(std::is_convertible, const typename OtherLayoutPolicy::template mapping&, mapping_type) || + !_MDSPAN_TRAIT(std::is_convertible, const OtherAccessor&, accessor_type) + ) MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan& other) : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref())) @@ -226,8 +236,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -240,8 +250,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -271,9 +281,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + extents_type::rank() == sizeof...(SizeTypes) && + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -285,8 +294,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -299,8 +308,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -311,7 +320,7 @@ class mdspan #endif // __cpp_lib_span #endif // MDSPAN_USE_PAREN_OPERATOR - MDSPAN_INLINE_FUNCTION constexpr size_t size() const noexcept { + MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { return __impl::__size(*this); }; @@ -346,13 +355,13 @@ class mdspan //-------------------------------------------------------------------------------- // [mdspan.basic.obs], mdspan observers of the mapping - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return mapping_type::is_always_unique(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() { return 
mapping_type::is_always_exhaustive(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return __mapping_ref().is_unique(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return __mapping_ref().is_exhaustive(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return __mapping_ref().is_strided(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const { return __mapping_ref().is_unique(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const { return __mapping_ref().is_exhaustive(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const { return __mapping_ref().is_strided(); }; MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return __mapping_ref().stride(r); }; private: @@ -374,7 +383,7 @@ class mdspan #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) MDSPAN_TEMPLATE_REQUIRES( class ElementType, class... SizeTypes, - /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) && + /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, size_t) /* && ... */) && (sizeof...(SizeTypes) > 0) ) MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...) diff --git a/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp index 3950273a83d..bdc5925f715 100644 --- a/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp +++ b/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp @@ -103,8 +103,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) 
&& + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) && (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) || container_is_array::value) && @@ -133,61 +133,29 @@ class mdarray { ) : map_(m), ctr_(container_is_array::construct(map_)) { } - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(const container_type& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(ctr) - { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (const container_type& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, const container_type& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(const container_type& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, const container_type& ctr) : map_(m), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) 
&& - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(container_type&& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(std::move(ctr)) - { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (container_type&& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, container_type&& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(container_type&& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, container_type&& ctr) : map_(m), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, /* requires */ ( @@ -229,7 +197,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, const container_type& ctr, const Alloc& a) : map_(exts), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -238,7 +206,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, const container_type& ctr, const Alloc& a) : map_(map), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -248,7 +216,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) 
MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, container_type&& ctr, const Alloc& a) : map_(exts), ctr_(std::move(ctr), a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -257,7 +225,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, container_type&& ctr, const Alloc& a) : map_(map), ctr_(std::move(ctr), a) { assert(ctr_.size() >= map_.required_span_size()); } @@ -344,8 +312,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -356,8 +324,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -433,8 +401,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + mdspan_type) ) ) constexpr operator mdspan () { @@ -445,8 +414,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + const_mdspan_type) ) ) constexpr operator mdspan () const { diff --git a/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp index 58f38620ba1..89ba8202fb1 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp @@ -20,7 +20,6 @@ #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace { template @@ -29,6 +28,7 @@ namespace { template struct __mdspan_is_integral_constant>: std::true_type {}; } + // Slice Specifier allowing for strides and compile time extent template struct strided_slice { @@ -36,14 +36,13 @@ struct strided_slice { using extent_type = ExtentType; using stride_type = StrideType; - OffsetType offset; - ExtentType extent; - StrideType stride; + _MDSPAN_NO_UNIQUE_ADDRESS OffsetType offset{}; + _MDSPAN_NO_UNIQUE_ADDRESS ExtentType extent{}; + _MDSPAN_NO_UNIQUE_ADDRESS StrideType stride{}; static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || 
__mdspan_is_integral_constant::value); }; -} // MDSPAN_IMPL_PROPOSED_NAMESPACE } // MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp index b9672b7f9ac..abddd0b59df 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp @@ -20,23 +20,21 @@ #include "submdspan_mapping.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { template MDSPAN_INLINE_FUNCTION constexpr auto submdspan(const mdspan &src, SliceSpecifiers... slices) { - const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...); + const auto sub_submdspan_mapping_result = submdspan_mapping(src.mapping(), slices...); // NVCC has a problem with the deduction so lets figure out the type - using sub_mapping_t = std::remove_cv_t; + using sub_mapping_t = std::remove_cv_t; using sub_extents_t = typename sub_mapping_t::extents_type; using sub_layout_t = typename sub_mapping_t::layout_type; using sub_accessor_t = typename AccessorPolicy::offset_policy; return mdspan( - src.accessor().offset(src.data_handle(), sub_mapping_offset.offset), - sub_mapping_offset.mapping, + src.accessor().offset(src.data_handle(), sub_submdspan_mapping_result.offset), + sub_submdspan_mapping_result.mapping, sub_accessor_t(src.accessor())); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp index f56ce023f16..c3b2f78fb99 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp @@ -20,7 +20,6 @@ #include "strided_slice.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace 
MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace detail { // Mapping from submapping ranks to srcmapping ranks @@ -319,5 +318,4 @@ constexpr auto submdspan_extents(const extents &src_exts, return detail::extents_constructor::next_extent( src_exts, slices...); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 48778d57e75..ca6948c9a9f 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -22,21 +22,15 @@ #include // index_sequence namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads //****************************************** -template struct mapping_offset { - Mapping mapping; +template struct submdspan_mapping_result { + _MDSPAN_NO_UNIQUE_ADDRESS LayoutMapping mapping{}; size_t offset; }; -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE namespace detail { -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank; - // constructs sub strides template MDSPAN_INLINE_FUNCTION @@ -98,17 +92,15 @@ struct preserve_layout_left_mapping, SubRank, #pragma diag_suppress = implicit_return_from_non_void_function #endif // Actual submdspan mapping call -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_left::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; +layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... 
slices) const { // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // figure out sub layout type @@ -121,18 +113,18 @@ submdspan_mapping(const layout_left::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_left case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -140,7 +132,7 @@ submdspan_mapping(const layout_left::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -207,17 +199,15 @@ struct preserve_layout_right_mapping, SubRank, #pragma diagnostic push #pragma diag_suppress = implicit_return_from_non_void_function #endif -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_right::mapping &src_mapping, - SliceSpecifiers... 
slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - +layout_right::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { // get sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // determine new layout type @@ -230,18 +220,18 @@ submdspan_mapping(const layout_right::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_right case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -249,7 +239,7 @@ submdspan_mapping(const layout_right::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -270,23 +260,22 @@ submdspan_mapping(const layout_right::mapping &src_mapping, //********************************** // layout_stride submdspan_mapping //********************************* -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const 
layout_stride::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); +layout_stride::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping; - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -294,6 +283,7 @@ submdspan_mapping(const layout_stride::mapping &src_mapping, #else std::tuple(detail::stride_of(slices)...))), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } + } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp new file mode 100644 index 00000000000..a8014867923 --- /dev/null +++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -0,0 +1,793 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "layout_padded_fwd.hpp" +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/extents.hpp" +#include "../__p0009_bits/mdspan.hpp" +#include "../__p0009_bits/layout_left.hpp" +#include "../__p0009_bits/layout_right.hpp" +#include "../__p0009_bits/layout_stride.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +namespace detail { +template +MDSPAN_INLINE_FUNCTION +constexpr _T +find_next_multiple(_T alignment, _T offset) +{ + if ( alignment == 0 ) { + return _T(0); + } else { + return ( ( offset + alignment - 1 ) / alignment) * alignment; + } +} + +template +MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { + constexpr auto rank = _ExtentsType::rank(); + + if constexpr (rank <= typename _ExtentsType::rank_type(1)) { + return 0; + } else if constexpr (_PaddingValue != dynamic_extent && + _ExtentsType::static_extent(_ExtentToPadIdx) != + dynamic_extent) { + static_assert( + (_PaddingValue != 0) || + (_ExtentsType::static_extent(_ExtentToPadIdx) == 0), + "padding stride can be 0 only if " + "extents_type::static_extent(extent-to-pad) is 0 or dynamic_extent"); + return find_next_multiple(_PaddingValue, + _ExtentsType::static_extent(_ExtentToPadIdx)); + } else { + return dynamic_extent; + } +} + +template +struct static_array_type_for_padded_extent +{ + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, + detail::get_actual_static_padding_value()>; +}; + +template +struct static_array_type_for_padded_extent<_PaddingValue, _Extents, + _ExtentToPadIdx, Rank, std::enable_if_t> { + using 
index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, 0>; +}; + +template +struct padded_extent { + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using static_array_type = typename static_array_type_for_padded_extent< + padding_value, _Extents, _ExtentToPadIdx, _Extents::rank()>::type; + + static constexpr auto static_value() { return static_array_type::static_value(0); } + + MDSPAN_INLINE_FUNCTION + static constexpr static_array_type + init_padding(const _Extents &exts) { + if constexpr ((_Extents::rank() > 1) && (padding_value == dynamic_extent)) { + return {exts.extent(_ExtentToPadIdx)}; + } else { + return init_padding(exts, padding_value); + } + } + + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Extents &exts, + [[maybe_unused]] index_type pv) { + if constexpr (_Extents::rank() > 1) { + return {find_next_multiple(pv, + exts.extent(_ExtentToPadIdx))}; + } else { + return {}; + } + } + + template + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Mapping &other_mapping, + std::integral_constant) { + if constexpr (_Extents::rank() > 1) { + return {other_mapping.stride(_PaddingStrideIdx)}; + } else { + return {}; + } + } +}; +} // namespace detail + +template +template +class layout_left_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_left_padded; + +#ifndef MDSPAN_INTERNAL_TEST +private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = 
detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "out of bounds access for rank 0"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { + index_type indices[] = {static_cast(index_offsets)...}; + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144 + index_type res = 0; + ((res = indices[extents_type::rank() - 1 - Ranks] + + ((extents_type::rank() - 1 - Ranks) == extent_to_pad_idx + ? 
padded_stride.value(0) + : exts.extent(extents_type::rank() - 1 - Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type& ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) + {} + + /** + * Initializes the mapping with the given extents and the specified padding value. + * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, dynamic_padding_value)), exts(ext) + { + assert((padding_value == dynamic_extent) || (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_left::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. 
+ * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + constexpr + mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr + mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) + { + s[r] = value; + value *= exts.extent(r); + } + s[extents_type::rank() - 1] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( 
extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = padded_stride.value(0); + for (rank_type r = 1; r < extents_type::rank(); ++r) { + value *= exts.extent(r); + } + return value; + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == 0) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_left_padded`s + * + * This overload only participates in overload 
resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; + +template +template +class layout_right_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + +#ifndef MDSPAN_INTERNAL_TEST + private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... 
index_offsets) const { + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 + index_type res = 0; + ((res = static_cast(index_offsets) + + (Ranks == extent_to_pad_idx ? padded_stride.value(0) + : exts.extent(Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) {} + + /** + * Initializes the mapping with the given extents and the specified padding value. 
+ * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, static_cast(dynamic_padding_value))), + exts(ext) { + assert((padding_value == dynamic_extent) || + (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_right::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. 
+ * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + {} + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) + { + s[r] = value; + value *= exts.extent(r); + } + s[0] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = 1; + for (rank_type r = 0; r < extent_to_pad_idx; ++r) + { + value *= exts.extent(r); + } + return value * padded_stride.value(0); + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... 
idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == extents_type::rank() - 1) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. 
we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; +} +} diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp new file mode 100644 index 00000000000..945f091a2dc --- /dev/null +++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "../__p0009_bits/dynamic_extent.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +template +struct layout_left_padded { + template + class mapping; +}; + +template +struct layout_right_padded { + template + class mapping; +}; + +namespace detail { +// The layout_padded_constants structs are only useful if rank > 1, otherwise they may wrap +template +struct layout_padded_constants; + +template +struct layout_padded_constants, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = 1; + static constexpr rank_type extent_to_pad_idx = 0; +}; + +template +struct layout_padded_constants, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = _ExtentsType::rank() - 2; + static constexpr rank_type extent_to_pad_idx = _ExtentsType::rank() - 1; +}; + +template +struct is_layout_left_padded : std::false_type {}; + +template +struct is_layout_left_padded> : std::true_type {}; + +template +struct is_layout_left_padded_mapping : std::false_type {}; + +template +struct is_layout_left_padded_mapping<_Mapping, + std::enable_if_t::template mapping>::value>> + : std::true_type {}; + +template +struct is_layout_right_padded : std::false_type {}; + +template +struct is_layout_right_padded> : std::true_type {}; + +template +struct is_layout_right_padded_mapping : std::false_type {}; + +template +struct is_layout_right_padded_mapping<_Mapping, + std::enable_if_t::template mapping>::value>> + : std::true_type {}; + +template +constexpr void check_padded_layout_converting_constructor_mandates() +{ + if constexpr (_LayoutExtentsType::rank() > 1) { + using extents_type = typename _PaddedLayoutMappingType::extents_type; + constexpr auto padding_value = _PaddedLayoutMappingType::padding_value; + constexpr 
auto idx = layout_padded_constants::extent_to_pad_idx; + if constexpr ((_LayoutExtentsType::static_extent(idx) != dynamic_extent) && + (extents_type::static_extent(idx) != dynamic_extent) && + (padding_value != dynamic_extent)) { + if constexpr (padding_value == 0) { + static_assert(_LayoutExtentsType::static_extent(idx) == 0); + } else { + static_assert( + _LayoutExtentsType::static_extent(idx) % padding_value == 0); + } + } + } +} + +template +constexpr void check_padded_layout_converting_constructor_preconditions([[maybe_unused]] const _OtherMapping &other_mapping) { + if constexpr (_ExtentsType::rank() > 1) { + constexpr auto padded_stride_idx = + layout_padded_constants::padded_stride_idx; + constexpr auto extent_to_pad_idx = layout_padded_constants::extent_to_pad_idx; + assert(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx)); + } +} +} +} +} diff --git a/tpls/mdspan/include/mdspan/mdspan.hpp b/tpls/mdspan/include/mdspan/mdspan.hpp index b440873526a..ac72a1a4e64 100644 --- a/tpls/mdspan/include/mdspan/mdspan.hpp +++ b/tpls/mdspan/include/mdspan/mdspan.hpp @@ -35,6 +35,7 @@ #include "../experimental/__p0009_bits/layout_right.hpp" #include "../experimental/__p0009_bits/macros.hpp" #if MDSPAN_HAS_CXX_17 +#include "../experimental/__p2642_bits/layout_padded.hpp" #include "../experimental/__p2630_bits/submdspan.hpp" #endif From 6db04b3b5803fb7686475effe2b6af071b885697 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Thu, 7 Mar 2024 13:10:11 -0600 Subject: [PATCH 327/432] CTAD (deduction guides) for RangePolicy (#6850) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Removed the check for NVCC compiler version when testing CTAD, as decltype(RangePolicy(...)) isn't compiling under any version of Cuda. 
* Added in the explicit deduction guides for RangePolicy: • Correctness when passing in an execution space • Workaround for nvcc as RangePolicy<...> doesn't have any template parameters that can be deduced, so gcc/clang assume that a matching ctor in the primary template deduces to RangePolicy<> while nvcc assumes it is a bug. Rewrote the tests to be of the form: [[maybe_unused]] static inline auto rpxy = RangePolicy(x, y); static_assert(is_same_v<RangePolicy<...>, decltype(rpxy)>); This form avoids the most vexing parse, and seems to pass on all compilers. Added tests for SomeExecutionSpace which is guaranteed not to be DefaultExecutionSpace. Added calls to ImplicitlyConvertibleToDefaultExecutionSpace::operator DefaultExecutionSpace() and SomeExecutionSpace::concurrency() to avoid maybe_unused errors under some compilers. Notes: The default constructed CTAD RangePolicy uses list initialization syntax (curlies) instead of parentheses to get around a gcc 8.2 compiler bug. nestodes uses "Kokkos::DefaultExecutionSpace" instead of "auto" in order to trigger the implicit conversion. 
--- core/src/Kokkos_ExecPolicy.hpp | 15 +++ core/unit_test/TestRangePolicyCTAD.cpp | 144 +++++++++++++++++-------- 2 files changed, 116 insertions(+), 43 deletions(-) diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index d0051560398..5f251eeb26a 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -328,6 +328,21 @@ class RangePolicy : public Impl::PolicyTraits { }; }; +RangePolicy()->RangePolicy<>; + +RangePolicy(int64_t, int64_t)->RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; + +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) + ->RangePolicy<>; + +template >> +RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; + +template >> +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; + } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/core/unit_test/TestRangePolicyCTAD.cpp b/core/unit_test/TestRangePolicyCTAD.cpp index 6a2c0f507be..20288e2b40a 100644 --- a/core/unit_test/TestRangePolicyCTAD.cpp +++ b/core/unit_test/TestRangePolicyCTAD.cpp @@ -17,76 +17,134 @@ #include #include "Kokkos_Core_fwd.hpp" -#if !defined(KOKKOS_COMPILER_NVCC) || KOKKOS_COMPILER_NVCC >= 1120 - namespace { -template -using PolicyMaker = decltype(::Kokkos::RangePolicy(std::declval()...)); - -template -inline constexpr bool IsSamePolicy = - std::is_same_v>; +struct TestRangePolicyCTAD { + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; -#define KOKKOS_TEST_RANGE_POLICY(...) 
static_assert(IsSamePolicy<__VA_ARGS__>) + [[maybe_unused]] static int concurrency() { return 0; } + }; + static_assert(Kokkos::is_execution_space_v); -struct TestRangePolicyCTAD { struct ImplicitlyConvertibleToDefaultExecutionSpace { - operator Kokkos::DefaultExecutionSpace() const { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { return Kokkos::DefaultExecutionSpace(); } }; static_assert(!Kokkos::is_execution_space_v< ImplicitlyConvertibleToDefaultExecutionSpace>); - using des = Kokkos::DefaultExecutionSpace; - using nes = ImplicitlyConvertibleToDefaultExecutionSpace; - using i64 = int64_t; - using i32 = int32_t; - using cs = Kokkos::ChunkSize; + [[maybe_unused]] static inline auto i64 = int64_t(); + [[maybe_unused]] static inline auto i32 = int32_t(); + [[maybe_unused]] static inline auto cs = Kokkos::ChunkSize(0); + [[maybe_unused]] static inline auto des = Kokkos::DefaultExecutionSpace(); + [[maybe_unused]] static inline auto nes = + ImplicitlyConvertibleToDefaultExecutionSpace(); + [[maybe_unused]] static inline auto ses = SomeExecutionSpace(); // RangePolicy() - // Guard against GGC 8.4 bug - // error: cannot deduce template arguments for ā€˜RangePolicyā€™ from () - // error: template argument 2 is invalid -#if !defined(KOKKOS_COMPILER_GNU) || (KOKKOS_COMPILER_GNU > 900) - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<> /*, no argument */); -#endif + [[maybe_unused]] static inline auto rp = Kokkos::RangePolicy{}; + static_assert(std::is_same_v, decltype(rp)>); // RangePolicy(index_type, index_type) - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i64); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i32); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i64); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i32); + [[maybe_unused]] static inline auto rpi64i64 = Kokkos::RangePolicy(i64, i64); + static_assert(std::is_same_v, decltype(rpi64i64)>); + + [[maybe_unused]] static inline auto rpi64i32 = 
Kokkos::RangePolicy(i64, i32); + static_assert(std::is_same_v, decltype(rpi64i32)>); + + [[maybe_unused]] static inline auto rpi32i64 = Kokkos::RangePolicy(i32, i64); + static_assert(std::is_same_v, decltype(rpi32i64)>); - // RangePolicy(index_type, index_type, Args...) + [[maybe_unused]] static inline auto rpi32i32 = Kokkos::RangePolicy(i32, i32); + static_assert(std::is_same_v, decltype(rpi32i32)>); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i64, cs); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i64, i32, cs); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i64, cs); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, i32, i32, cs); + // RangePolicy(index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpi64i64cs = + Kokkos::RangePolicy(i64, i64, cs); + static_assert(std::is_same_v, decltype(rpi64i64cs)>); + + [[maybe_unused]] static inline auto rpi64i32cs = + Kokkos::RangePolicy(i64, i32, cs); + static_assert(std::is_same_v, decltype(rpi64i32cs)>); + + [[maybe_unused]] static inline auto rpi32i64cs = + Kokkos::RangePolicy(i32, i64, cs); + static_assert(std::is_same_v, decltype(rpi32i64cs)>); + + [[maybe_unused]] static inline auto rpi32i32cs = + Kokkos::RangePolicy(i32, i32, cs); + static_assert(std::is_same_v, decltype(rpi32i32cs)>); // RangePolicy(execution_space, index_type, index_type) - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i64, i64); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i32, i32); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i64, i64); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i32, i32); + [[maybe_unused]] static inline auto rpdesi64i64 = + Kokkos::RangePolicy(des, i64, i64); + static_assert(std::is_same_v, decltype(rpdesi64i64)>); + + [[maybe_unused]] static inline auto rpdesi32i32 = + Kokkos::RangePolicy(des, i32, i32); + static_assert(std::is_same_v, decltype(rpdesi32i32)>); + + [[maybe_unused]] static inline auto rpnesi64i64 = + Kokkos::RangePolicy(nes, 
i64, i64); + static_assert(std::is_same_v, decltype(rpnesi64i64)>); - // RangePolicy(execution_space, index_type, index_type, Args...) + [[maybe_unused]] static inline auto rpnesi32i32 = + Kokkos::RangePolicy(nes, i32, i32); + static_assert(std::is_same_v, decltype(rpnesi32i32)>); + + [[maybe_unused]] static inline auto rpsesi64i64 = + Kokkos::RangePolicy(ses, i64, i64); + static_assert(std::is_same_v, + decltype(rpsesi64i64)>); + + [[maybe_unused]] static inline auto rpsesi32i32 = + Kokkos::RangePolicy(ses, i32, i32); + static_assert(std::is_same_v, + decltype(rpsesi32i32)>); + + // RangePolicy(execution_space, index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpdesi64i64cs = + Kokkos::RangePolicy(des, i64, i64, cs); + static_assert(std::is_same_v, decltype(rpdesi64i64cs)>); + + [[maybe_unused]] static inline auto rpdesi32i32cs = + Kokkos::RangePolicy(des, i32, i32, cs); + static_assert(std::is_same_v, decltype(rpdesi32i32cs)>); + + [[maybe_unused]] static inline auto rpnesi64i64cs = + Kokkos::RangePolicy(nes, i64, i64, cs); + static_assert(std::is_same_v, decltype(rpnesi64i64cs)>); + + [[maybe_unused]] static inline auto rpnesi32i32cs = + Kokkos::RangePolicy(nes, i32, i32, cs); + static_assert(std::is_same_v, decltype(rpnesi32i32cs)>); + + [[maybe_unused]] static inline auto rpsesi64i64cs = + Kokkos::RangePolicy(ses, i64, i64, cs); + static_assert(std::is_same_v, + decltype(rpsesi64i64cs)>); + + [[maybe_unused]] static inline auto rpsesi32i32cs = + Kokkos::RangePolicy(ses, i32, i32, cs); + static_assert(std::is_same_v, + decltype(rpsesi32i32cs)>); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i64, i64, cs); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, des, i32, i32, cs); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i64, i64, cs); - KOKKOS_TEST_RANGE_POLICY(Kokkos::RangePolicy<>, nes, i32, i32, cs); }; // TestRangePolicyCTAD struct // To eliminate maybe_unused warning on some compilers -[[maybe_unused]] const 
Kokkos::DefaultExecutionSpace des = + +[[maybe_unused]] const Kokkos::DefaultExecutionSpace nestodes = TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); -} // namespace +[[maybe_unused]] const auto sesconcurrency = + TestRangePolicyCTAD::ses.concurrency(); -#endif +} // namespace From cfc260ac0aa36b42626d51093e1098aefbe79da7 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Fri, 8 Mar 2024 13:28:00 -0600 Subject: [PATCH 328/432] CTAD (deduction guides) for MDRangePolicy (#5516) * CTAD for MDRangePolicy * WIP: Moved CTAD tests from TestMDRangePolicyConstructors.hpp to TestMDRangePolicyCTAD.cpp * WIP Added [[maybe_unused]] to all static inline variables in CTAD tests * Removed TEST_EXECSPACE from the MDRangePolicy CTAD tests (as that isn't needed for compile time only tests) and associated cleanup * Marked struct ImplicitlyConvertibleToDefaultExecutionSpace::operator Kokkos::DefaultExecutionSpace() const; as [[maybe_unused]] to make it consistent with the other policy CTAD tests (in other PRs) * Added a Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on ImplicitlyConvertibleToDefaultExecutionSpace::operator Kokkos::DefaultExecutionSpace() const by defining it and implicitly calling it in another [[maybe_unused]] static inline variable. 
* Workaround for HIP-ROCm-5.2 "declared but never referenced" * Added MDRangePolicy CTAD tests for initializer_lists Fixed a comment --- core/src/KokkosExp_MDRangePolicy.hpp | 75 +++++++++++- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestMDRangePolicyCTAD.cpp | 138 +++++++++++++++++++++++ 3 files changed, 209 insertions(+), 5 deletions(-) create mode 100644 core/unit_test/TestMDRangePolicyCTAD.cpp diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index 2df274b81f9..297b1fadee9 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -155,9 +155,20 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // multi-dimensional iteration pattern template -struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { - using traits = Kokkos::Impl::PolicyTraits; - using range_policy = RangePolicy; +struct MDRangePolicy; + +// Note: If MDRangePolicy has a primary template, implicit CTAD (deduction +// guides) are generated -> MDRangePolicy<> by some compilers, which is +// incorrect. By making it a template specialization instead, no implicit CTAD +// is generated. This works because there has to be at least one property +// specified (which is Rank<...>); otherwise, we'd get the static_assert +// "Kokkos::Error: MD iteration pattern not defined". This template +// specialization uses in all places for correctness. 
+template +struct MDRangePolicy + : public Kokkos::Impl::PolicyTraits { + using traits = Kokkos::Impl::PolicyTraits; + using range_policy = RangePolicy; typename traits::execution_space m_space; @@ -166,8 +177,8 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { typename traits::schedule_type, typename traits::index_type>; using execution_policy = - MDRangePolicy; // needed for is_execution_space - // interrogation + MDRangePolicy; // needed for is_execution_policy + // interrogation template friend struct MDRangePolicy; @@ -377,6 +388,60 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } }; +template +MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; + +template +MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], + const TT (&)[TN]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&, Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&, + Array const&) + ->MDRangePolicy>; + } // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 
5e56c45b8ff..f1ce00ce4a3 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -92,6 +92,7 @@ SET(COMPILE_ONLY_SOURCES TestViewRank.cpp TestViewTypeTraits.cpp TestTypeList.cpp + TestMDRangePolicyCTAD.cpp view/TestExtentsDatatypeConversion.cpp ) diff --git a/core/unit_test/TestMDRangePolicyCTAD.cpp b/core/unit_test/TestMDRangePolicyCTAD.cpp new file mode 100644 index 00000000000..d433590f2a8 --- /dev/null +++ b/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -0,0 +1,138 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestMDRangePolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int t[5]; + [[maybe_unused]] static inline int64_t tt[5]; + [[maybe_unused]] static inline Kokkos::Array a; + [[maybe_unused]] static inline Kokkos::Array aa; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 "declared but never referenced" + TestMDRangePolicyCTAD() { + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes); + } + + // MDRangePolicy with C array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, t, t))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, t, t))>); + + // MDRangePolicy with Kokkos::initializer_list parameters + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); + + [[maybe_unused]] static inline int64_t i64; + 
static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + des, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(ses, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + // MDRangePolicy with Kokkos::Array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a, aa))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a))>); + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a, aa))>); +}; + +} // namespace From 35ad698e03594745f07728df0bcdaa49c5d0e7d4 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 11 Mar 2024 09:56:36 -0400 Subject: [PATCH 329/432] Add support for rocThrust in sort when using HIP (#6793) * Use rocthrust in sort when using HIP * Fix reviewer's comments * Make sure that we don't compile Kokkos for every supported architecture when enabling rocthrust * Export ROCTHRUST as a Kokkos dependency --- .jenkins | 4 +- algorithms/src/CMakeLists.txt | 3 +- .../src/sorting/impl/Kokkos_SortImpl.hpp | 42 +++++++++++++++++++ cmake/KokkosCore_config.h.in | 1 + cmake/Modules/FindTPLROCTHRUST.cmake | 15 +++++++ cmake/kokkos_tpls.cmake | 3 ++ 
core/src/HIP/Kokkos_HIP_Instance.cpp | 7 ++++ 7 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 cmake/Modules/FindTPLROCTHRUST.cmake diff --git a/.jenkins b/.jenkins index 5a859420fd3..ae3bffd92d7 100644 --- a/.jenkins +++ b/.jenkins @@ -139,7 +139,7 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } @@ -181,7 +181,7 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6-complete' label 'rocm-docker' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt index 16957789472..7cb47a316b5 100644 --- a/algorithms/src/CMakeLists.txt +++ b/algorithms/src/CMakeLists.txt @@ -30,5 +30,4 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms ${CMAKE_CURRENT_SOURCE_DIR} ) - - +KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) diff --git a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index d87ab09e772..4c174b5fda9 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -63,6 +63,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) #include #include @@ -184,6 +189,26 @@ void sort_cudathrust(const Cuda& space, } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void 
sort_rocthrust(const HIP& space, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_onedpl(const Kokkos::Experimental::SYCL& space, @@ -274,6 +299,14 @@ void sort_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View& view) { + sort_rocthrust(exec, view); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( @@ -320,6 +353,15 @@ void sort_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 33ebfd266b4..2df0f6c5205 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -58,6 +58,7 @@ #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_ENABLE_ONEDPL +#cmakedefine KOKKOS_ENABLE_ROCTHRUST #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX diff --git a/cmake/Modules/FindTPLROCTHRUST.cmake b/cmake/Modules/FindTPLROCTHRUST.cmake new file mode 100644 index 00000000000..ff75e3a3eeb --- /dev/null +++ b/cmake/Modules/FindTPLROCTHRUST.cmake @@ -0,0 +1,15 @@ +# ROCm 5.6 and earlier set 
AMDGPU_TARGETS and GPU_TARGETS to all the supported +# architectures. Therefore, we end up compiling Kokkos for all the supported +# architectures. Starting with ROCm 5.7 AMDGPU_TARGETS and GPU_TARGETS are empty. +# It is the user's job to set the variables. Since we are injecting the +# architecture flag ourselves, we can leave the variables empty. To replicate the +# behavior of ROCm 5.7 and later for earlier versions of ROCm we set +# AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If +# the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. +SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +FIND_PACKAGE(rocthrust REQUIRED) +KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) + +# Export ROCTHRUST as a Kokkos dependency +KOKKOS_EXPORT_CMAKE_TPL(ROCTHRUST) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index f80d724f7f4..c9ebcd6b94b 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -40,6 +40,8 @@ ELSE() SET(ROCM_DEFAULT OFF) ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) +KOKKOS_TPL_OPTION(ROCTHRUST ${KOKKOS_ENABLE_HIP}) + IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) SET(ONEDPL_DEFAULT ON) ELSE() @@ -83,6 +85,7 @@ IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() KOKKOS_IMPORT_TPL(LIBQUADMATH) +KOKKOS_IMPORT_TPL(ROCTHRUST) IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 74bab397429..22c0db047f6 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -90,6 +90,13 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif + s << "macro KOKKOS_ENABLE_ROCTHRUST : " +#if defined(KOKKOS_ENABLE_ROCTHRUST) + << "defined\n"; +#else + << 
"undefined\n"; +#endif + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); From e5126e929450f52dfc18f7bd5b7b33fc221e5b48 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 11 Mar 2024 09:34:22 -0400 Subject: [PATCH 330/432] Add HIP specialization for sort-by-key --- .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index 5dc7047dde3..36deccdfb1e 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -59,6 +59,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) && \ (ONEDPL_VERSION_MAJOR > 2022 || \ (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) @@ -117,6 +122,26 @@ void sort_by_key_cudathrust( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template inline constexpr bool sort_on_device_v = @@ -272,6 +297,17 @@ void sort_by_key_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template @@ -317,6 +353,18 @@ void sort_by_key_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template From 3a765351c9aa01aa0d4a1bca0102f45a54b13dfe Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 11 Mar 2024 10:42:08 -0400 Subject: [PATCH 331/432] Fix unused variable warning in TestMDRangePolicyCTAD.cpp --- core/unit_test/TestMDRangePolicyCTAD.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestMDRangePolicyCTAD.cpp b/core/unit_test/TestMDRangePolicyCTAD.cpp index d433590f2a8..b2c3d021c35 100644 --- a/core/unit_test/TestMDRangePolicyCTAD.cpp +++ b/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -45,6 +45,7 @@ struct TestMDRangePolicyCTAD { [[maybe_unused]] static inline int64_t tt[5]; [[maybe_unused]] static inline Kokkos::Array a; [[maybe_unused]] static inline Kokkos::Array aa; + [[maybe_unused]] static inline int64_t 
i64; // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on // ImplicitlyConvertibleToDefaultExecutionSpace::operator @@ -54,7 +55,7 @@ struct TestMDRangePolicyCTAD { // Workaround for HIP-ROCm-5.2 "declared but never referenced" TestMDRangePolicyCTAD() { - maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes); + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes, i64); } // MDRangePolicy with C array parameters @@ -83,7 +84,6 @@ struct TestMDRangePolicyCTAD { decltype(Kokkos::MDRangePolicy( {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); - [[maybe_unused]] static inline int64_t i64; static_assert(std::is_same_v>, decltype(Kokkos::MDRangePolicy( {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, From a2b64e0e8e78fe4a257f32a871984997d48305f3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 11 Mar 2024 15:24:38 -0400 Subject: [PATCH 332/432] Improve message on view out of bounds access and always abort (#6861) * Add test for view out-of-bounds access * [DO NOT MERGE] enable bound checking in view accesses unconditionally * Abort on out-of-bounds access on the host side instead of throwing * Improve out-of-bounds error message * Add test for printing multidimensional index * Try to get offsetview to compile * Do not bother with the label on the device side * Update test to reflect that device does not print the label * Fixup OffsetView out-of-bounds on device * Revert "Update test to reflect that device does not print the label" This reverts commit 7dbecbf635386556d3557c78a495970b439a3e05. * Revert "Do not bother with the label on the device side" This reverts commit 40348e88d265f7dfaf42354db5be18db21d55808. * Fixup UNMANAGED -> UNAVAILABLE * Only enable the test when debug bound checking is enabled * Revert "[DO NOT MERGE] enable bound checking in view accesses unconditionally" This reverts commit ff9d411fa0e8898f3db60055a12f222e9836fc57. 
* Fix typo formated -> formatted * Drop unnecessary cast to void and Kokkos::Impl:: qualification * Add test with mixed integer types * Improve function name check_bounds -> within_range * Prefer right fold per review * Per review drop immediately invoked lambda trick on the host side * Silence warnings about tracker variable not being used on the device --- containers/src/Kokkos_OffsetView.hpp | 11 +- core/src/impl/Kokkos_ViewMapping.hpp | 154 +++++++--------- core/unit_test/CMakeLists.txt | 1 + core/unit_test/TestViewOutOfBoundsAccess.hpp | 175 +++++++++++++++++++ 4 files changed, 243 insertions(+), 98 deletions(-) create mode 100644 core/unit_test/TestViewOutOfBoundsAccess.hpp diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 92aead28784..91a7e4a9273 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -124,15 +124,8 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( args...); Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if it is not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. 
*/ - if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device(map); - } else { Kokkos::abort("OffsetView bounds error"); })) + KOKKOS_IF_ON_DEVICE( + (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) } } diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index a2b41d98a91..3217c76e380 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -3537,110 +3537,86 @@ class ViewMapping< namespace Kokkos { namespace Impl { -template -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) { - return true; +template +KOKKOS_FUNCTION bool within_range(Map const& map, + std::index_sequence, + Indices... indices) { + return (((std::size_t)indices < map.extent(Enumerate)) && ...); } -template -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map, - const iType& i, - Args... args) { - return (size_t(i) < map.extent(R)) && - view_verify_operator_bounds(map, args...); +template +KOKKOS_FUNCTION constexpr char* append_formatted_multidimensional_index( + char* dest, Indices... indices) { + char* d = dest; + strcat(d, "["); + ( + [&] { + d += strlen(d); + to_chars_i(d, + d + 20, // 20 digits ought to be enough + indices); + strcat(d, ","); + }(), + ...); + d[strlen(d) - 1] = ']'; // overwrite trailing comma + return dest; } -template -inline void view_error_operator_bounds(char*, int, const MapType&) {} - -template -inline void view_error_operator_bounds(char* buf, int len, const MapType& map, - const iType& i, Args... args) { - const int n = snprintf( - buf, len, " %ld < %ld %c", static_cast(i), - static_cast(map.extent(R)), (sizeof...(Args) ? 
',' : ')')); - view_error_operator_bounds(buf + n, len - n, map, args...); +template +KOKKOS_FUNCTION void print_extents(char* dest, Map const& map, + std::index_sequence) { + append_formatted_multidimensional_index(dest, map.extent(Enumerate)...); } -/* Check #3: is the View managed as determined by the MemoryTraits? */ -template -struct OperatorBoundsErrorOnDevice; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const&) { Kokkos::abort("View bounds error"); } -}; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const& map) { - SharedAllocationHeader const* const header = - SharedAllocationHeader::get_header( - static_cast(map.data())); - char const* const label = header->label(); - enum { LEN = 128 }; - char msg[LEN]; - char const* const first_part = "View bounds error of view "; - char* p = msg; - char* const end = msg + LEN - 1; - for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - *p = '\0'; - Kokkos::abort(msg); - } -}; - -/* Check #2: does the ViewMapping have the printable_label_typedef defined? - See above that only the non-specialized standard-layout ViewMapping has - this defined by default. - The existence of this alias indicates the existence of MapType::is_managed - */ template using printable_label_typedef_t = typename T::printable_label_typedef; -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const&) { - Kokkos::abort("View bounds error"); -} - -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const& map) { - OperatorBoundsErrorOnDevice::run(map); -} - template KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( Kokkos::Impl::ViewTracker const& tracker, const MapType& map, Args... 
args) { - if (!view_verify_operator_bounds<0>(map, args...)) { + if (!within_range(map, std::make_index_sequence(), + args...)) { + char err[256] = ""; + strcat(err, "Kokkos::View ERROR: out of bounds access"); + strcat(err, " label=(\""); KOKKOS_IF_ON_HOST( - (enum {LEN = 1024}; char buffer[LEN]; - const std::string label = - tracker.m_tracker.template get_label(); - int n = snprintf(buffer, LEN, "View bounds error of view %s (", - label.c_str()); - view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if its not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. */ if (tracker.m_tracker.has_record()) { - operator_bounds_error_on_device(map); - } else { Kokkos::abort("View bounds error"); })) + strncat(err, tracker.m_tracker.template get_label().c_str(), + 128); + } else { strcat(err, "**UNMANAGED**"); }) + KOKKOS_IF_ON_DEVICE([&] { + // Check #1: is there a SharedAllocationRecord? (we won't use it, but + // if its not there then there isn't a corresponding + // SharedAllocationHeader containing a label). This check should cover + // the case of Views that don't have the Unmanaged trait but were + // initialized by pointer. + if (!tracker.m_tracker.has_record()) { + strcat(err, "**UNMANAGED**"); + return; + } + // Check #2: does the ViewMapping have the printable_label_typedef + // defined? See above that only the non-specialized standard-layout + // ViewMapping has this defined by default. The existence of this + // alias indicates the existence of MapType::is_managed + if constexpr (is_detected_v) { + // Check #3: is the View managed as determined by the MemoryTraits? 
+ if constexpr (MapType::is_managed != 0) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header( + static_cast(map.data())); + char const* const label = header->label(); + strcat(err, label); + return; + } + strcat(err, "**UNAVAILABLE**"); + } + }();) + strcat(err, "\") with indices "); + append_formatted_multidimensional_index(err, args...); + strcat(err, " but extents "); + print_extents(err, map, std::make_index_sequence()); + Kokkos::abort(err); } } diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index f1ce00ce4a3..6dfb7505c5d 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -245,6 +245,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewMapping_subview ViewMemoryAccessViolation ViewOfClass + ViewOutOfBoundsAccess ViewResize WorkGraph WithoutInitializing diff --git a/core/unit_test/TestViewOutOfBoundsAccess.hpp b/core/unit_test/TestViewOutOfBoundsAccess.hpp new file mode 100644 index 00000000000..2716856c1fc --- /dev/null +++ b/core/unit_test/TestViewOutOfBoundsAccess.hpp @@ -0,0 +1,175 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include + +namespace { + +TEST(TEST_CATEGORY, append_formatted_multidimensional_index) { + using Kokkos::Impl::append_formatted_multidimensional_index; + { + char buffer[64] = "my prefix "; + append_formatted_multidimensional_index(buffer, 1); + EXPECT_STREQ(buffer, "my prefix [1]"); + } + { + char buffer[64] = "I was here"; + append_formatted_multidimensional_index(buffer, 1, 2, 3); + EXPECT_STREQ(buffer, "I was here[1,2,3]"); + } + { + char buffer[64] = "with mixed integer types "; + append_formatted_multidimensional_index(buffer, 1u, -2); + EXPECT_STREQ(buffer, "with mixed integer types [1,-2]"); + } +} + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + +template +struct TestViewOutOfBoundAccess { + View v; + static constexpr auto rank = View::rank; + + template + KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence) const { + return v((Is * 1 + Is == 0 ? v.extent(Is) + 3 : 0)...); + } + + KOKKOS_FUNCTION void operator()(int) const { + ++bad_access(std::make_index_sequence{}); + } + + template + std::string get_details(std::index_sequence) { + std::stringstream ss; + ss << "with indices \\["; + ((ss << (Is == 0 ? v.extent(Is) + 3 : 0) + << (Is == View::rank() - 1 ? "\\]" : ",")), + ...); + ss << " but extents \\["; + ((ss << v.extent(Is) << (Is == View::rank() - 1 ? 
"\\]" : ",")), ...); + return ss.str(); + } + + auto get_details() { + return get_details(std::make_index_sequence()); + } + + TestViewOutOfBoundAccess(View w, ExecutionSpace const& s, std::string matcher) + : v(std::move(w)) { + constexpr bool view_accessible_from_execution_space = + Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/typename View::memory_space>::accessible; + EXPECT_TRUE(view_accessible_from_execution_space); + + matcher += ".*" + get_details(); + + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template +auto make_view_impl(LblOrPtr x, std::index_sequence) { + return View(x, (Is + 1)...); +} + +template +auto make_view(LblOrPtr x) { + return make_view_impl(std::move(x), + std::make_index_sequence()); +} + +template +void test_view_out_of_bounds_access() { + ExecutionSpace const exec_space{}; + // clang-format off + using V1 = Kokkos::View; + using V2 = Kokkos::View; + using V3 = Kokkos::View; + using V4 = Kokkos::View; + using V5 = Kokkos::View; + using V6 = Kokkos::View; + using V7 = Kokkos::View; + using V8 = Kokkos::View; + std::string const prefix = "Kokkos::View ERROR: out of bounds access"; + std::string const lbl = "my_label"; + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + int* const ptr = nullptr; + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + 
".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + // clang-format on +} + +TEST(TEST_CATEGORY_DEATH, view_out_of_bounds_access) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (false && Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same_v) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same_v) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the device"; + } +#endif + + test_view_out_of_bounds_access(); +} + +#endif + +} // namespace From 8062a602070ce6d611b73c9eb626ebd7dc1a039b Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 11 Mar 2024 17:49:21 -0400 Subject: [PATCH 333/432] Fix linking with rothrust in downstream applications --- cmake/Modules/FindTPLROCTHRUST.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/FindTPLROCTHRUST.cmake 
b/cmake/Modules/FindTPLROCTHRUST.cmake index ff75e3a3eeb..dae7dc3c952 100644 --- a/cmake/Modules/FindTPLROCTHRUST.cmake +++ b/cmake/Modules/FindTPLROCTHRUST.cmake @@ -12,4 +12,4 @@ FIND_PACKAGE(rocthrust REQUIRED) KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) # Export ROCTHRUST as a Kokkos dependency -KOKKOS_EXPORT_CMAKE_TPL(ROCTHRUST) +KOKKOS_EXPORT_CMAKE_TPL(rocthrust) From 5e7cab99b5789abe6a1b411134f41fd80bf1c740 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 12 Mar 2024 21:43:54 -0400 Subject: [PATCH 334/432] SYCL: Make sure to call find_dependency for oneDPL if necessary (#6870) * Make sure to call find_dependency for oneDPL if necessary * Move linking with oneDPL to algorithms * Don't guard KOKKOS_IMPORT_TPL by CMake language --- algorithms/src/CMakeLists.txt | 1 + cmake/Modules/FindTPLONEDPL.cmake | 3 +++ cmake/kokkos_tpls.cmake | 2 +- core/src/CMakeLists.txt | 1 - 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt index 7cb47a316b5..b490caca628 100644 --- a/algorithms/src/CMakeLists.txt +++ b/algorithms/src/CMakeLists.txt @@ -31,3 +31,4 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms ) KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) diff --git a/cmake/Modules/FindTPLONEDPL.cmake b/cmake/Modules/FindTPLONEDPL.cmake index 01791cff443..603510c315e 100644 --- a/cmake/Modules/FindTPLONEDPL.cmake +++ b/cmake/Modules/FindTPLONEDPL.cmake @@ -43,4 +43,7 @@ ELSE() COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) ENDIF() + + # Export oneDPL as a Kokkos dependency + KOKKOS_EXPORT_CMAKE_TPL(oneDPL) ENDIF() diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index c9ebcd6b94b..3998eaa9b21 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -82,8 +82,8 @@ IF (NOT WIN32) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ROCM 
INTERFACE) - KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() +KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) KOKKOS_IMPORT_TPL(LIBQUADMATH) KOKKOS_IMPORT_TPL(ROCTHRUST) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 7b7c31a6baa..b84677e61b6 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -191,7 +191,6 @@ IF (NOT WIN32) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) ENDIF() # FIXME: We need a proper solution to figure out whether to enable From c1a800650e83da2eb515599f495f9bce2aaba80f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 13 Mar 2024 12:17:03 -0400 Subject: [PATCH 335/432] Don't use Fedora development version in GitHub CI --- .github/workflows/continuous-integration-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 6425cc2668e..5c7067c6686 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -20,7 +20,7 @@ jobs: continue-on-error: true strategy: matrix: - distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] + distro: ['fedora:latest', 'ubuntu:latest'] cxx: ['g++', 'clang++'] cxx_extra_flags: [''] cmake_build_type: ['Release', 'Debug'] From 49bd895ae27a53f06f72c7acf8a8f60043295ae1 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 13 Mar 2024 20:22:48 -0600 Subject: [PATCH 336/432] kokkos_tpls.cmake: update default option to enable rocthrust Avoid configuration issues with Trilinos of the form: Make Error at kokkos/cmake/kokkos_tpls.cmake:29 (MESSAGE): Enabled TPL ROCTHRUST inside TriBITS build, but this can only be enabled in a standalone build --- cmake/kokkos_tpls.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake 
index 3998eaa9b21..6ef3b79bde2 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -39,8 +39,13 @@ IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT ELSE() SET(ROCM_DEFAULT OFF) ENDIF() +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) + SET(ROCTHRUST_DEFAULT ON) +ELSE() + SET(ROCTHRUST_DEFAULT OFF) +ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -KOKKOS_TPL_OPTION(ROCTHRUST ${KOKKOS_ENABLE_HIP}) +KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) SET(ONEDPL_DEFAULT ON) From 096e72437cf93d78d7a85414f811de8c7f608b0e Mon Sep 17 00:00:00 2001 From: Thomas Conrad Clevenger Date: Thu, 14 Mar 2024 10:48:48 -0600 Subject: [PATCH 337/432] Scratch space fix for MultiGPU (#6866) * Use CudaSpace::(de)allocate() instead of Kokkos_SharedAllocate.hpp for Cuda scratch memory Fixes bug where all scratch mem was allocated on default device * Add scratch test to MultiGPU testing * fix typo in multi-gpu test Co-authored-by: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> * Use reductions for error accumulator * Remove exec_ arg from ScratchFunctor (not needed anymore) * avoid implicit capture of this in test --------- Co-authored-by: Dong Hun Lee <59181952+ldh4@users.noreply.github.com> --- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 17 ++-- .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 97 +++++++++++++++++++ 2 files changed, 106 insertions(+), 8 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index f439f4fd0b7..849e8b3b30e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -428,21 +428,21 @@ void *CudaInternal::resize_team_scratch_space(int scratch_pool_id, // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
+ auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (m_team_scratch_current_size[scratch_pool_id] == 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::CudaSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", bytes); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -459,8 +459,8 @@ void CudaInternal::finalize() { was_finalized = true; + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); auto host_mem_space = Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); cuda_mem_space.deallocate(m_scratchFlags, @@ -476,7 +476,8 @@ void CudaInternal::finalize() { for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) - Kokkos::kokkos_free(m_team_scratch_ptr[i]); + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } m_scratchSpaceCount = 0; diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp index 5fec4020921..d94735ceb23 100644 --- a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ 
b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -168,4 +168,101 @@ TEST(cuda_multi_gpu, unmanaged_views) { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); } } + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", 
policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} + +TEST(cuda_multi_gpu, scratch_space) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + test_scratch(execs[0], execs[1]); + } +} } // namespace From 05bd485166c98372e254f4566978f12b9eef3eeb Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 14 Mar 2024 19:08:39 -0400 Subject: [PATCH 338/432] [ci skip] Bump version number to 4.3.99 --- CMakeLists.txt | 2 +- Makefile.kokkos | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b2168ff9ad..162579821e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,7 +150,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 2) +set(Kokkos_VERSION_MINOR 3) set(Kokkos_VERSION_PATCH 99) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") diff --git a/Makefile.kokkos b/Makefile.kokkos index 58931fc6917..b146b4a296d 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1,7 +1,7 @@ # Default settings common options. 
KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 2 +KOKKOS_VERSION_MINOR = 3 KOKKOS_VERSION_PATCH = 99 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) From 841b3a9f96eecb28bcc637ea6e5fbe63c9e3e92d Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Thu, 15 Feb 2024 18:35:34 +0100 Subject: [PATCH 339/432] Fix deep copy when filling Rank-7 views Typo in loop indices, from wrong copy-paste. --- core/src/Kokkos_CopyViews.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..cab14b84025 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -223,8 +223,8 @@ struct ViewFill { : a(a_), val(val_) { Kokkos::parallel_for("Kokkos::ViewFill-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, - {a.extent(0), a.extent(1), a.extent(2), - a.extent(3), a.extent(5), a.extent(6)}), + {a.extent(0), a.extent(1), a.extent(3), + a.extent(4), a.extent(5), a.extent(6)}), *this); } From a2f2ba4049c89ee0085712ee15ba22160ae3baf3 Mon Sep 17 00:00:00 2001 From: Cedric Chevalier Date: Fri, 16 Feb 2024 11:58:34 +0100 Subject: [PATCH 340/432] TestViewCopy_c.hpp: add new unit test for deep copy (ViewFill) --- core/src/Kokkos_CopyViews.hpp | 8 + core/unit_test/CMakeLists.txt | 2 + core/unit_test/Makefile | 22 +- core/unit_test/TestViewCopy_c.hpp | 435 ++++++++++++++++++++++++++++++ 4 files changed, 456 insertions(+), 11 deletions(-) create mode 100644 core/unit_test/TestViewCopy_c.hpp diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index cab14b84025..bcb7b61d2cb 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -221,6 +221,8 @@ struct ViewFill { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) 
Kokkos::parallel_for("Kokkos::ViewFill-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -249,6 +251,8 @@ struct ViewFill { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewFill-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -461,6 +465,8 @@ struct ViewCopy { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewCopy-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -494,6 +500,8 @@ struct ViewCopy { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewCopy-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 6dfb7505c5d..dd3f51af004 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -236,6 +236,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewAPI_e ViewCopy_a ViewCopy_b + ViewCopy_c ViewCtorDimMatch ViewEmptyRuntimeUnmanaged ViewHooks @@ -353,6 +354,7 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL ViewAPI_e ViewCopy_a ViewCopy_b + ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 202809d3fc9..a4d65687e54 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -62,7 +62,7 @@ else 
STACK_TRACE_TERMINATE_FILTER := endif -TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize +TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ @@ -73,7 +73,7 @@ tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ ) \ ) -GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d 
ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview +GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13 @@ -110,14 +110,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA += TestCuda_Init.o OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o OBJ_CUDA += TestCuda_RangePolicy.o TestCuda_RangePolicyRequire.o - OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o + OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o TestCuda_ViewCopy_c.o OBJ_CUDA += TestCuda_DeepCopyAlignment.o OBJ_CUDA += TestCuda_ViewMapping_a.o TestCuda_ViewMapping_b.o TestCuda_ViewMapping_subview.o TestCuda_ViewResize.o TestCuda_ViewLayoutStrideAssignment.o OBJ_CUDA += TestCudaUVM_ViewAPI_a.o TestCudaUVM_ViewAPI_b.o TestCudaUVM_ViewAPI_c.o TestCudaUVM_ViewAPI_d.o TestCudaUVM_ViewAPI_e.o - OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o + OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o TestCudaUVM_ViewCopy_c.o OBJ_CUDA += TestCudaUVM_ViewMapping_a.o TestCudaUVM_ViewMapping_b.o TestCudaUVM_ViewMapping_subview.o OBJ_CUDA += TestCudaHostPinned_ViewAPI_a.o TestCudaHostPinned_ViewAPI_b.o TestCudaHostPinned_ViewAPI_c.o TestCudaHostPinned_ViewAPI_d.o TestCudaHostPinned_ViewAPI_e.o - OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o + OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o TestCudaHostPinned_ViewCopy_c.o OBJ_CUDA += 
TestCudaHostPinned_ViewMapping_a.o TestCudaHostPinned_ViewMapping_b.o TestCudaHostPinned_ViewMapping_subview.o OBJ_CUDA += TestCuda_View_64bit.o OBJ_CUDA += TestCuda_ViewOfClass.o @@ -162,7 +162,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) OBJ_THREADS += TestThreads_RangePolicy.o TestThreads_RangePolicyRequire.o OBJ_THREADS += TestThreads_View_64bit.o OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o TestThreads_ViewAPI_c.o TestThreads_ViewAPI_d.o TestThreads_ViewAPI_e.o - OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o + OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o TestThreads_ViewCopy_c.o OBJ_THREADS += TestThreads_DeepCopyAlignment.o OBJ_THREADS += TestThreads_ViewMapping_a.o TestThreads_ViewMapping_b.o TestThreads_ViewMapping_subview.o TestThreads_ViewResize.o TestThreads_ViewLayoutStrideAssignment.o OBJ_THREADS += TestThreads_ViewOfClass.o @@ -198,7 +198,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) OBJ_OPENMP += TestOpenMP_RangePolicy.o TestOpenMP_RangePolicyRequire.o OBJ_OPENMP += TestOpenMP_View_64bit.o OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o TestOpenMP_ViewAPI_c.o TestOpenMP_ViewAPI_d.o TestOpenMP_ViewAPI_e.o - OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o + OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o TestOpenMP_ViewCopy_c.o OBJ_OPENMP += TestOpenMP_ViewMapping_a.o TestOpenMP_ViewMapping_b.o TestOpenMP_ViewMapping_subview.o TestOpenMP_ViewResize.o TestOpenMP_ViewLayoutStrideAssignment.o OBJ_OPENMP += TestOpenMP_ViewOfClass.o OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o @@ -237,7 +237,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) #OBJ_OPENMPTARGET += TestOpenMPTarget_SharedAlloc.o OBJ_OPENMPTARGET += TestOpenMPTarget_RangePolicy.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_a.o TestOpenMPTarget_ViewAPI_b.o TestOpenMPTarget_ViewAPI_c.o TestOpenMPTarget_ViewAPI_d.o #Some 
commented out code - #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o TestOpenMPTarget_ViewCopy_c.o OBJ_OPENMPTARGET += TestOpenMPTarget_DeepCopyAlignment.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_a.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_b.o @@ -292,7 +292,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) OBJ_HIP += TestHIP_Memory_Requirements.o OBJ_HIP += TestHIP_ParallelScanRangePolicy.o OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o - OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o + OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o TestHIPHostPinned_ViewCopy_c.o OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o TARGETS += KokkosCore_UnitTest_HIP @@ -307,7 +307,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) OBJ_HPX += TestHPX_RangePolicy.o TestHPX_RangePolicyRequire.o OBJ_HPX += TestHPX_View_64bit.o OBJ_HPX += TestHPX_ViewAPI_a.o TestHPX_ViewAPI_b.o TestHPX_ViewAPI_c.o TestHPX_ViewAPI_d.o TestHPX_ViewAPI_e.o - OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o + OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o TestHPX_ViewCopy_c.o OBJ_HPX += TestHPX_ViewMapping_a.o TestHPX_ViewMapping_b.o TestHPX_ViewMapping_subview.o TestHPX_ViewResize.o OBJ_HPX += TestHPX_ViewOfClass.o OBJ_HPX += TestHPX_SubView_a.o TestHPX_SubView_b.o @@ -347,7 +347,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) OBJ_SERIAL += TestSerial_RangePolicy.o TestSerial_RangePolicyRequire.o OBJ_SERIAL += TestSerial_View_64bit.o OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o TestSerial_ViewAPI_c.o TestSerial_ViewAPI_d.o TestSerial_ViewAPI_e.o - OBJ_SERIAL += 
TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o + OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o TestSerial_ViewCopy_c.o OBJ_SERIAL += TestSerial_ViewMapping_a.o TestSerial_ViewMapping_b.o TestSerial_ViewMapping_subview.o TestSerial_ViewResize.o TestSerial_ViewLayoutStrideAssignment.o OBJ_SERIAL += TestSerial_ViewOfClass.o OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o diff --git a/core/unit_test/TestViewCopy_c.hpp b/core/unit_test/TestViewCopy_c.hpp new file mode 100644 index 00000000000..2f34b35fb07 --- /dev/null +++ b/core/unit_test/TestViewCopy_c.hpp @@ -0,0 +1,435 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { +// Do not rely on deep_copy(0) as we want to test it! 
+template +void reset_view(const ExecSpace& space, ViewType& a, int magic) { + auto policy = Kokkos::RangePolicy(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + Kokkos::parallel_for( + "TestViewCopy::ResetView", policy, + KOKKOS_LAMBDA(int i) { a.data()[i] = magic; }); +} + +template +size_t compute_overall_sum(const ExecSpace& space, ViewType& a) { + auto policy = Kokkos::RangePolicy(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + typename ViewType::value_type sum = 0; + Kokkos::parallel_reduce( + "TestViewCopy::ComputeSum", policy, + KOKKOS_LAMBDA(int i, int& lcl_sum) { lcl_sum += a.data()[i]; }, sum); + + return static_cast(sum); +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 0>* = nullptr) { + auto policy = Kokkos::RangePolicy(space, 0, 1); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank0", policy, + KOKKOS_LAMBDA(int, bool& local_check) { local_check &= (a() == magic); }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 1>* = nullptr) { + auto policy = Kokkos::RangePolicy(space, 0, a.extent(0)); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank1", policy, + KOKKOS_LAMBDA(int i, bool& local_check) { + local_check &= (a(i) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 2>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0}, {a.extent(0), a.extent(1)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + 
Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank2", policy, + KOKKOS_LAMBDA(int i0, int i1, bool& local_check) { + local_check &= (a(i0, i1) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 3>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank3", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, bool& local_check) { + local_check &= (a(i0, i1, i2) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 4>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank4", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, bool& local_check) { + local_check &= (a(i0, i1, i2, i3) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 5>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank5", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, bool& local_check) { + local_check &= (a(i0, i1, i2, 
i3, i4) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 6>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank6", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5) == magic); + }, + Kokkos::LAnd(all_elements_are_set)); + + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 7>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer < a.extent(6); ++outer) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank7", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, outer) == magic); + }, + Kokkos::LAnd(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + return all_elements_are_set; +} + +template +bool check_magic_value( + const ExecSpace& space, const Kokkos::View& a, int magic, + std::enable_if_t::rank == 8>* = nullptr) { + auto policy = Kokkos::MDRangePolicy, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + 
for (size_t outer = 0; outer < a.extent(7); ++outer) { + for (size_t inner = 0; inner < a.extent(6); ++inner) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank8", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, inner, outer) == magic); + }, + Kokkos::LAnd(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + } + return all_elements_are_set; +} + +template +bool view_fill_test(const ExecSpace& space, ViewType& a, int magic) { + Kokkos::deep_copy(space, a, magic); +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + if constexpr (std::is_same_v) { + return true; + } +#endif // KOKKOS_ENABLE_OPENMPTARGET + return check_magic_value(space, a, magic); +} + +template +void run_test() { + int magic = 19; + + using ViewType = Kokkos::View; + // Create views with different lengths for each dimension + // We want to test if all loops are over the correct dimensions + // We use prime numbers to make sure that the strides are different + ViewType a_decreasing("a", 23, 19, 17, 13, 11, 7, 5, 3); + // We also test with increasing strides to catch more "out-of-bounds" errors + // within subviews. + ViewType a_increasing("a", 3, 5, 7, 11, 13, 17, 19, 23); + + using exec_space = typename Space::execution_space; + auto space = exec_space(); + + // Use subviews in the tests to have cases with different ranks and + // non-contiguous memory + // Tests have two parts: + // 1. Fill the subview with a magic value and check that all elements are set + // 2. 
Check if only the subview is set by summing all elements in the view and + // comparing to the subview size times the magic value + + // Rank 0 + { + auto sub_dec = Kokkos::subview(a_decreasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + static_cast(magic)); + + auto sub_inc = Kokkos::subview(a_increasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), + static_cast(magic)); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 1 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 2 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 3 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ( + compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * 
sub_dec.extent(1) * sub_dec.extent(2) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 4 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * sub_dec.extent(1) * sub_dec.extent(2) * + sub_dec.extent(3) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 5 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 6 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto 
sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 7 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 8 + { + auto sub_dec = Kokkos::subview( + a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview( + a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_right) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutRight; + run_test(); +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_left) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutLeft; + run_test(); +} + +} // namespace From 
ae4d0013dd0a7ae4b3cd036a3ae9f0eb9002bea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Mon, 11 Mar 2024 14:12:15 +0100 Subject: [PATCH 341/432] TestViewCopy_c.hpp: better handling for OpenMPTarget Disabling call to value checker as it is not supported by the backend. --- core/unit_test/TestViewCopy_c.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestViewCopy_c.hpp b/core/unit_test/TestViewCopy_c.hpp index 2f34b35fb07..758af13c7df 100644 --- a/core/unit_test/TestViewCopy_c.hpp +++ b/core/unit_test/TestViewCopy_c.hpp @@ -237,12 +237,11 @@ template bool view_fill_test(const ExecSpace& space, ViewType& a, int magic) { Kokkos::deep_copy(space, a, magic); #if defined(KOKKOS_ENABLE_OPENMPTARGET) - if constexpr (std::is_same_v) { - return true; - } -#endif // KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET Does not work with Land reducer + return true; +#else // KOKKOS_ENABLE_OPENMPTARGET return check_magic_value(space, a, magic); +#endif // KOKKOS_ENABLE_OPENMPTARGET } template From 46354d25d4befe2f9f2fc4b46ea30e3121b3d64e Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 18 Mar 2024 10:32:55 -0400 Subject: [PATCH 342/432] Use builtin for atomic_fetch in the HIP backend Co-authored-by: Damien L-G --- .../desul/include/desul/atomics/Adapt_HIP.hpp | 77 ++++++++ .../include/desul/atomics/Fetch_Op_HIP.hpp | 165 +++++++++--------- 2 files changed, 163 insertions(+), 79 deletions(-) create mode 100644 tpls/desul/include/desul/atomics/Adapt_HIP.hpp diff --git a/tpls/desul/include/desul/atomics/Adapt_HIP.hpp b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp new file mode 100644 index 00000000000..0eab27fe989 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Adapt_HIP.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_ADAPT_HIP_HPP_ +#define DESUL_ATOMICS_ADAPT_HIP_HPP_ + +#include + +namespace desul { +namespace Impl { + +// FIXME same code as GCCMemoryOrder +template +struct HIPMemoryOrder; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELAXED; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQUIRE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_RELEASE; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_ACQ_REL; +}; + +template <> +struct HIPMemoryOrder { + static constexpr int value = __ATOMIC_SEQ_CST; +}; + +// __HIP_MEMORY_SCOPE_SYSTEM +// __HIP_MEMORY_SCOPE_AGENT +// __HIP_MEMORY_SCOPE_WORKGROUP +// __HIP_MEMORY_SCOPE_WAVEFRONT +// __HIP_MEMORY_SCOPE_SINGLETHREAD +template +struct HIPMemoryScope; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_WORKGROUP; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_AGENT; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +template <> +struct HIPMemoryScope { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp index e9c749809de..920722084d1 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp @@ -9,99 +9,106 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_FECH_OP_HIP_HPP_ #define DESUL_ATOMICS_FECH_OP_HIP_HPP_ +#include + namespace desul { namespace Impl { -// clang-format off -inline __device__ int device_atomic_fetch_add( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { 
return atomicAdd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_add( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_add(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ float device_atomic_fetch_add( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ double device_atomic_fetch_add( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } - -inline __device__ int device_atomic_fetch_sub( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_sub( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_sub(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ float device_atomic_fetch_sub( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ double device_atomic_fetch_sub( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } - -inline __device__ int device_atomic_fetch_min( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_min( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_min(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } - -inline __device__ int device_atomic_fetch_max( int* ptr, int 
val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_max( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_max(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } - -inline __device__ int device_atomic_fetch_and( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_and( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_and(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } - -inline __device__ int device_atomic_fetch_or ( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned int device_atomic_fetch_or ( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_or (unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, T) \ + template \ + __device__ inline T device_atomic_fetch_##OP( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_##OP(ptr, \ + val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_xor( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_xor( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned long 
long device_atomic_fetch_xor(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, long long) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned long long) + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, float) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, double) + +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(add) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(min) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(max) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(and) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(or) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(add) +// atomic min/max gives the wrong results (tested with ROCm 6.0 on Frontier) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(min) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(max) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(T) \ + template \ + __device__ inline T device_atomic_fetch_sub( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -val, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_inc( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_inc( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1ull); } +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(long 
long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(float) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(double) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_SUB + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC(T) \ + template \ + __device__ inline T device_atomic_fetch_inc( T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + 1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } \ + template \ + __device__ inline T device_atomic_fetch_dec( T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -1, \ + HIPMemoryOrder::value, \ + HIPMemoryScope::value); \ + } -inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1 ); } +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned long long) -inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } -// clang-format on +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MEMORY_SCOPE, MEMORY_SCOPE_STRING_LITERAL) \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ - __threadfence(); \ - TYPE 
return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ - __threadfence(); \ - return return_val; \ + __device__ inline unsigned int device_atomic_fetch_inc_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_inc32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } \ template \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + __device__ inline unsigned int device_atomic_fetch_dec_mod(unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_dec32( \ + ptr, val, HIPMemoryOrder::value, MEMORY_SCOPE_STRING_LITERAL); \ } -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) - -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, double) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) 
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeCore, "workgroup") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeDevice, "agent") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeNode, "") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeSystem, "") -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD } // namespace Impl } // namespace desul From 872dc422ff8b77616ac5884f26b83ced61328ee0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 22 Mar 2024 16:03:24 -0400 Subject: [PATCH 343/432] Fix Makefile.kokkos for Threads --- Makefile.targets | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.targets b/Makefile.targets index e6900a822a8..e8e429e0275 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -81,7 +81,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) From 2035e313d7a54f9e1572eb5f315249ea841fb258 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Mon, 25 Mar 2024 19:54:46 -0400 Subject: [PATCH 344/432] Fix a bug in Makefile when using AMD GPU architectures (#6892) * Fix bug in Makefile when using AMD GPU architectures * Fix indentation * Fix documentation of the architecture to match the code --- Makefile.kokkos | 28 ++++++++++++++++++++++------ 1 file changed, 22 
insertions(+), 6 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index b146b4a296d..6b7fff0dd6e 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" @@ -400,13 +400,29 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call 
kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) From 8d734b0267284c3232e715355eb2c690d305ae3f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 27 Mar 2024 23:08:05 -0400 Subject: [PATCH 345/432] Cuda: Fix configuring with CMake 3.28.4 (#6898) * Cuda: Fix configuring with CMake 3.29.0 * CMake 3.28.4 is also affected --- cmake/Modules/FindTPLCUDA.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/FindTPLCUDA.cmake b/cmake/Modules/FindTPLCUDA.cmake index 792c92c07e9..5a62c530fce 100644 --- a/cmake/Modules/FindTPLCUDA.cmake +++ b/cmake/Modules/FindTPLCUDA.cmake @@ -7,7 +7,8 @@ IF (NOT CUDAToolkit_ROOT) ENDIF() ENDIF() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +# FIXME CMake 3.28.4 creates more targets than we export +IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4") find_package(CUDAToolkit) ELSE() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) From 68c66846945c28e7b95ba099f2b4fbbe2b72fd63 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 28 Mar 2024 17:13:00 -0400 Subject: [PATCH 346/432] Update Intel GPU architectures in Makefile (#6895) * Update Intel GPU architectures in Makefile * Add some comments --- Makefile.kokkos | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 6b7fff0dd6e..73decf8b47a 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -15,7 +15,7 @@ KOKKOS_DEVICES ?= "Threads" # IBM: Power8,Power9 # AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 -# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC +# Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -318,12 +318,43 @@ KOKKOS_INTERNAL_USE_ARCH_ICL := $(call 
kokkos_has_string,$(KOKKOS_ARCH),ICL) KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX) KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR) -KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +# Traditionally, we supported, e.g., IntelGen9 instead of Intel_Gen9. The latter +# matches the CMake option but we also accept the former for backward-compatibility. KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen11) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen12LP) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP)) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call 
kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_DG1) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_XeHP) +endif +# Traditionally the architecture was called PVC instead of Intel_PVC. This +# version makes us accept IntelPVC and Intel_PVC as well. KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC) # NVIDIA based. From e2cfdec5434e1b0e8ff9a47c081fcc49ad3f8550 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 28 Mar 2024 23:25:46 -0400 Subject: [PATCH 347/432] Drop Experimental::LayoutTiled class template --- core/src/Kokkos_Layout.hpp | 110 +- core/src/impl/Kokkos_ViewLayoutTiled.hpp | 1425 ------------------ core/unit_test/TestOther.hpp | 4 - core/unit_test/TestViewLayoutTiled.hpp | 1756 ---------------------- 4 files changed, 1 insertion(+), 3294 deletions(-) delete mode 100644 core/src/impl/Kokkos_ViewLayoutTiled.hpp delete mode 100644 core/unit_test/TestViewLayoutTiled.hpp diff --git a/core/src/Kokkos_Layout.hpp b/core/src/Kokkos_Layout.hpp index ca4d956784c..6c0a0c9cf1d 100644 --- a/core/src/Kokkos_Layout.hpp +++ b/core/src/Kokkos_Layout.hpp @@ -217,81 +217,9 @@ enum class Iterate { Right // Right indices stride fastest }; -// To check for LayoutTiled -// This is to hide extra compile-time 'identifier' info within the LayoutTiled -// class by not relying on template specialization to include the ArgN*'s -template +template struct is_layouttiled : std::false_type {}; -template -struct is_layouttiled> - : std::true_type {}; - -namespace Experimental { - -/// LayoutTiled -// Must have Rank >= 2 -template < - Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2 = 0, unsigned 
ArgN3 = 0, unsigned ArgN4 = 0, - unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0, - bool IsPowerOfTwo = - (Kokkos::Impl::is_integral_power_of_two(ArgN0) && - Kokkos::Impl::is_integral_power_of_two(ArgN1) && - (Kokkos::Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0)))> -struct LayoutTiled { - static_assert(IsPowerOfTwo, - "LayoutTiled must be given power-of-two tile dimensions"); - - using array_layout = LayoutTiled; - static constexpr Iterate outer_pattern = OuterP; - static constexpr Iterate inner_pattern = InnerP; - - enum { N0 = ArgN0 }; - enum { N1 = ArgN1 }; - enum { N2 = ArgN2 }; - enum { N3 = ArgN3 }; - enum { N4 = ArgN4 }; - enum { N5 = ArgN5 }; - enum { N6 = ArgN6 }; - enum { N7 = ArgN7 }; - - size_t dimension[ARRAY_LAYOUT_MAX_RANK]; - - enum : bool { is_extent_constructible = true }; - - LayoutTiled(LayoutTiled const&) = default; - LayoutTiled(LayoutTiled&&) = default; - LayoutTiled& operator=(LayoutTiled const&) = default; - LayoutTiled& operator=(LayoutTiled&&) = default; - - KOKKOS_INLINE_FUNCTION - explicit constexpr LayoutTiled(size_t argN0 = 0, size_t argN1 = 0, - size_t argN2 = 0, size_t argN3 = 0, - size_t argN4 = 0, size_t argN5 = 0, - size_t argN6 = 0, size_t argN7 = 0) - : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {} - - friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) { - for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) - if (left.dimension[rank] != right.dimension[rank]) return false; - return true; - } - - friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) { - return !(left == right); - } -}; - -} 
// namespace Experimental - // For use with view_copy template struct layout_iterate_type_selector { @@ -321,42 +249,6 @@ struct layout_iterate_type_selector { Kokkos::Iterate::Default; }; -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; - -template -struct layout_iterate_type_selector> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; - } // namespace Kokkos #endif // #ifndef KOKKOS_LAYOUT_HPP diff --git a/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/core/src/impl/Kokkos_ViewLayoutTiled.hpp deleted file mode 100644 index 957717f973d..00000000000 --- a/core/src/impl/Kokkos_ViewLayoutTiled.hpp +++ /dev/null @@ -1,1425 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP -#define KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// View offset and mapping for tiled view's - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout> - : public std::true_type {}; - -template -struct is_array_layout< - Kokkos::Experimental::LayoutTiled> - : public std::true_type {}; - -template -struct is_array_layout_tiled : public std::false_type {}; - -template -struct is_array_layout_tiled> : public std::true_type { -}; // Last template parameter "true" meaning this currently only supports - // powers-of-two - -namespace Impl { - -template -struct ViewOffset< - Dimension, Layout, - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && - is_array_layout::value && - is_array_layout_tiled::value)>> { - public: - static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern; - static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern; - - static constexpr int VORank = Dimension::rank; - - static constexpr unsigned SHIFT_0 = - Kokkos::Impl::integral_power_of_two(Layout::N0); - static constexpr unsigned SHIFT_1 = - Kokkos::Impl::integral_power_of_two(Layout::N1); - static constexpr unsigned SHIFT_2 = - Kokkos::Impl::integral_power_of_two(Layout::N2); - static constexpr unsigned SHIFT_3 = - Kokkos::Impl::integral_power_of_two(Layout::N3); - static constexpr unsigned SHIFT_4 = - 
Kokkos::Impl::integral_power_of_two(Layout::N4); - static constexpr unsigned SHIFT_5 = - Kokkos::Impl::integral_power_of_two(Layout::N5); - static constexpr unsigned SHIFT_6 = - Kokkos::Impl::integral_power_of_two(Layout::N6); - static constexpr unsigned SHIFT_7 = - Kokkos::Impl::integral_power_of_two(Layout::N7); - static constexpr int MASK_0 = Layout::N0 - 1; - static constexpr int MASK_1 = Layout::N1 - 1; - static constexpr int MASK_2 = Layout::N2 - 1; - static constexpr int MASK_3 = Layout::N3 - 1; - static constexpr int MASK_4 = Layout::N4 - 1; - static constexpr int MASK_5 = Layout::N5 - 1; - static constexpr int MASK_6 = Layout::N6 - 1; - static constexpr int MASK_7 = Layout::N7 - 1; - - static constexpr unsigned SHIFT_2T = SHIFT_0 + SHIFT_1; - static constexpr unsigned SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2; - static constexpr unsigned SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3; - static constexpr unsigned SHIFT_5T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4; - static constexpr unsigned SHIFT_6T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5; - static constexpr unsigned SHIFT_7T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6; - static constexpr unsigned SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6 + SHIFT_7; - - // Is an irregular layout that does not have uniform striding for each index. 
- using is_mapping_plugin = std::true_type; - using is_regular = std::false_type; - - using size_type = size_t; - using dimension_type = Dimension; - using array_layout = Layout; - - dimension_type m_dim; - size_type m_tile_N0; // Num tiles dim 0 - size_type m_tile_N1; - size_type m_tile_N2; - size_type m_tile_N3; - size_type m_tile_N4; - size_type m_tile_N5; - size_type m_tile_N6; - size_type m_tile_N7; - - //---------------------------------------- - -#define KOKKOS_IMPL_DEBUG_OUTPUT_CHECK 0 - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, - I1 const& i1) const { - auto tile_offset = - (outer_pattern == (Kokkos::Iterate::Left)) - ? (((i0 >> SHIFT_0) + m_tile_N0 * ((i1 >> SHIFT_1))) << SHIFT_2T) - : (((m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1))) << SHIFT_2T); - // ( num_tiles[1] * ti0 + ti1 ) * FTD - - auto local_offset = (inner_pattern == (Kokkos::Iterate::Left)) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0)) - : (((i0 & MASK_0) << SHIFT_1) + (i1 & MASK_1)); - // ( tile_dim[1] * li0 + li1 ) - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << "locali0 = " << (i0 & MASK_0) - << "\nlocali1 = " << (i1 & MASK_1) << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + m_tile_N1 * (i2 >> SHIFT_2))) - << SHIFT_3T) - : ((m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) - << SHIFT_3T); - - auto local_offset = (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1))) - : (((i0 & MASK_0) << (SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_2)) + (i2 & MASK_2)); - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 << " i2 = " << i2 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << " tilei2 = " << (i2 >> SHIFT_2) - << "\nlocali0 = " << (i0 & MASK_0) - << "locali1 = " << (i1 & MASK_1) << "locali2 = " << (i2 & MASK_2) - << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, - I3 const& i3) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * (i3 >> SHIFT_3)))) - << SHIFT_4T) - : ((m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) - << SHIFT_4T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2))) - : (((i0 & MASK_0) << (SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_3)) + (i3 & MASK_3)); - - return tile_offset + local_offset; - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * ((i3 >> SHIFT_3) + - m_tile_N3 * (i4 >> SHIFT_4))))) - << SHIFT_5T) - : ((m_tile_N4 * - (m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) - << SHIFT_5T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3))) - : (((i0 & MASK_0) << (SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_4)) + (i4 & MASK_4)); - - return tile_offset + local_offset; - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, - I5 const& i5) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * ((i4 >> SHIFT_4) + - m_tile_N4 * (i5 >> SHIFT_5)))))) - << SHIFT_6T) - : ((m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) - << SHIFT_6T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4))) - : (((i0 & MASK_0) - << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_5)) + (i5 & MASK_5)); - - return tile_offset + local_offset; - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * (i6 >> SHIFT_6))))))) - << SHIFT_7T) - : ((m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) - << SHIFT_7T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5))) - : (((i0 & MASK_0) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + - SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) - << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_6)) + (i6 & MASK_6)); - - return tile_offset + local_offset; - } - - // Rank 8 - template - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6, - I7 const& i7) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * - ((i6 >> SHIFT_6) + - m_tile_N6 * (i7 >> SHIFT_7)))))))) - << SHIFT_8T) - : ((m_tile_N7 * - (m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * - (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) + - (i7 >> SHIFT_7)) - << SHIFT_8T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5)) + - ((i7 & MASK_7) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6))) - : (((i0 & MASK_0) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) - << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_7 + SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_7 + SHIFT_6)) + - ((i6 & MASK_6) << (SHIFT_7)) + (i7 & MASK_7)); - - return tile_offset + local_offset; - } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { - return array_layout((VORank > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (VORank > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (VORank > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (VORank > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (VORank > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (VORank > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (VORank > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (VORank > 7 ? 
m_dim.N7 : KOKKOS_INVALID_INDEX)); - } - - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { - return m_dim.N0; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { - return m_dim.N1; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { - return m_dim.N2; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { - return m_dim.N3; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { - return m_dim.N4; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { - return m_dim.N5; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { - return m_dim.N6; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { - return m_dim.N7; - } - - KOKKOS_INLINE_FUNCTION constexpr size_type size() const { - return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * - m_dim.N6 * m_dim.N7; - } - - // Strides are meaningless due to irregularity - KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0; } - - // Stride with [ rank ] value is the total length - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 0; - if (0 < dimension_type::rank) { - s[1] = 0; - } - if (1 < dimension_type::rank) { - s[2] = 0; - } - if (2 < dimension_type::rank) { - s[3] = 0; - } - if (3 < dimension_type::rank) { - s[4] = 0; - } - if (4 < dimension_type::rank) { - s[5] = 0; - } - if (5 < 
dimension_type::rank) { - s[6] = 0; - } - if (6 < dimension_type::rank) { - s[7] = 0; - } - if (7 < dimension_type::rank) { - s[8] = 0; - } - } - - KOKKOS_INLINE_FUNCTION constexpr size_type span() const { - // Rank2: ( NumTile0 * ( NumTile1 ) ) * TileSize, etc - return (VORank == 2) - ? (m_tile_N0 * m_tile_N1) << SHIFT_2T - : (VORank == 3) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2) << SHIFT_3T - : (VORank == 4) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * m_tile_N3) - << SHIFT_4T - : (VORank == 5) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4) - << SHIFT_5T - : (VORank == 6) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4 * m_tile_N5) - << SHIFT_6T - : (VORank == 7) - ? (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6) - << SHIFT_7T - : (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6 * m_tile_N7) - << SHIFT_8T; - } - - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return true; - } - - //---------------------------------------- -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - return *this; - } -#else - KOKKOS_DEFAULTED_FUNCTION ~ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset(const ViewOffset&) = default; - KOKKOS_DEFAULTED_FUNCTION 
ViewOffset& operator=(const ViewOffset&) = default; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr ViewOffset( - std::integral_constant const&, - array_layout const arg_layout) - : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], - arg_layout.dimension[2], arg_layout.dimension[3], - arg_layout.dimension[4], arg_layout.dimension[5], - arg_layout.dimension[6], arg_layout.dimension[7]), - m_tile_N0((arg_layout.dimension[0] + MASK_0) >> - SHIFT_0 /* number of tiles in first dimension */), - m_tile_N1((arg_layout.dimension[1] + MASK_1) >> SHIFT_1), - m_tile_N2((VORank > 2) ? (arg_layout.dimension[2] + MASK_2) >> SHIFT_2 - : 0), - m_tile_N3((VORank > 3) ? (arg_layout.dimension[3] + MASK_3) >> SHIFT_3 - : 0), - m_tile_N4((VORank > 4) ? (arg_layout.dimension[4] + MASK_4) >> SHIFT_4 - : 0), - m_tile_N5((VORank > 5) ? (arg_layout.dimension[5] + MASK_5) >> SHIFT_5 - : 0), - m_tile_N6((VORank > 6) ? (arg_layout.dimension[6] + MASK_6) >> SHIFT_6 - : 0), - m_tile_N7((VORank > 7) ? (arg_layout.dimension[7] + MASK_7) >> SHIFT_7 - : 0) {} -}; - -// FIXME Remove the out-of-class definitions when we require C++17 -#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE \ - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \ - is_array_layout::value && \ - is_array_layout_tiled::value)> -template -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern; -template -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::inner_pattern; -template -constexpr int - ViewOffset::VORank; -template -constexpr unsigned - ViewOffset::SHIFT_0; -template -constexpr unsigned - ViewOffset::SHIFT_1; -template -constexpr unsigned - ViewOffset::SHIFT_2; -template -constexpr unsigned - ViewOffset::SHIFT_3; -template -constexpr unsigned - ViewOffset::SHIFT_4; -template -constexpr unsigned - ViewOffset::SHIFT_5; -template -constexpr unsigned - ViewOffset::SHIFT_6; -template -constexpr unsigned 
- ViewOffset::SHIFT_7; -template -constexpr int - ViewOffset::MASK_0; -template -constexpr int - ViewOffset::MASK_1; -template -constexpr int - ViewOffset::MASK_2; -template -constexpr int - ViewOffset::MASK_3; -template -constexpr int - ViewOffset::MASK_4; -template -constexpr int - ViewOffset::MASK_5; -template -constexpr int - ViewOffset::MASK_6; -template -constexpr int - ViewOffset::MASK_7; -template -constexpr unsigned - ViewOffset::SHIFT_2T; -template -constexpr unsigned - ViewOffset::SHIFT_3T; -template -constexpr unsigned - ViewOffset::SHIFT_4T; -template -constexpr unsigned - ViewOffset::SHIFT_5T; -template -constexpr unsigned - ViewOffset::SHIFT_6T; -template -constexpr unsigned - ViewOffset::SHIFT_7T; -template -constexpr unsigned - ViewOffset::SHIFT_8T; -#undef KOKKOS_ITERATE_VIEW_OFFSET_ENABLE - -//---------------------------------------- - -// ViewMapping assign method needed in order to return a 'subview' tile as a -// proper View The outer iteration pattern determines the mapping of the pointer -// offset to the beginning of requested tile The inner iteration pattern is -// needed for the layout of the tile's View to be returned Rank 2 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1) { - using dst_map_type = ViewMapping; - using src_map_type = 
ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left ? ((i_tile0 + src.m_impl_offset.m_tile_N0 * i_tile1) - << src_offset_type::SHIFT_2T) - : ((src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) - << src_offset_type::SHIFT_2T)) // offset to start - // of the tile - ), - dst_offset_type()); - } -}; - -// Rank 3 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * i_tile2)) - << src_offset_type::SHIFT_3T) - : ((src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) + - i_tile2) - << src_offset_type::SHIFT_3T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 4 -template -class ViewMapping< - std::enable_if_t<(N4 == 0 && N5 == 0 && N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T****, - Kokkos::Experimental::LayoutTiled, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * - (i_tile2 + src.m_impl_offset.m_tile_N2 * - i_tile3))) - << src_offset_type::SHIFT_4T) - : ((src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) - << src_offset_type::SHIFT_4T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 5 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * i_tile4)))) - << src_offset_type::SHIFT_5T) - : ((src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) - << src_offset_type::SHIFT_5T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 6 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + src.m_impl_offset.m_tile_N4 * - i_tile5))))) - << src_offset_type::SHIFT_6T) - : ((src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) - << src_offset_type::SHIFT_6T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 7 -template -class ViewMapping // void - , - Kokkos::ViewTraits< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5, iType6> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - i_tile6)))))) - << src_offset_type::SHIFT_7T) - : ((src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) - << src_offset_type::SHIFT_7T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 8 -template -class ViewMapping< - std::enable_if_t<(N0 != 0 && N1 != 0 && N2 != 0 && N3 != 0 && N4 != 0 && - N5 != 0 && N6 != 0 && N7 != 0)> // void - , - Kokkos::ViewTraits< - T********, - Kokkos::Experimental::LayoutTiled, - P...>, - Kokkos::Experimental::LayoutTiled, - iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled; - using src_traits = Kokkos::ViewTraits; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t; - using traits = - Kokkos::ViewTraits; - using type = - Kokkos::View; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping& dst, const ViewMapping& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6, const iType7 i_tile7) { - using dst_map_type = ViewMapping; - using src_map_type = ViewMapping; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = 
dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - (i_tile6 + - src.m_impl_offset.m_tile_N6 * - i_tile7))))))) - << src_offset_type::SHIFT_8T) - : ((src.m_impl_offset.m_tile_N7 * - (src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) + - i_tile7) - << src_offset_type::SHIFT_8T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------- - -namespace Kokkos { - -// Rank 2 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View(src, SrcLayout(), i_tile0, - i_tile1); -} - -// Rank 3 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2); -} - -// Rank 4 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3); -} - -// Rank 5 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4); -} - -// Rank 6 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5); -} - -// Rank 7 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6); -} - -// Rank 8 -template -KOKKOS_INLINE_FUNCTION - Kokkos::View, - P...> - tile_subview(const Kokkos::View< - T********, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6, const size_t i_tile7) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled; - - return Kokkos::View( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6, i_tile7); -} - -} /* namespace Kokkos */ -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIENTAL_VIEWLAYOUTTILE_HPP */ diff --git a/core/unit_test/TestOther.hpp b/core/unit_test/TestOther.hpp index fcf0353a88c..935ae2b5f58 100644 --- a/core/unit_test/TestOther.hpp +++ b/core/unit_test/TestOther.hpp @@ -21,8 +21,4 @@ #include #include -// with VS 16.11.3 and CUDA 11.4.2 getting cudafe stackoverflow crash -#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) -#include -#endif #endif diff --git a/core/unit_test/TestViewLayoutTiled.hpp b/core/unit_test/TestViewLayoutTiled.hpp deleted file mode 100644 index 67308212ee0..00000000000 --- a/core/unit_test/TestViewLayoutTiled.hpp +++ /dev/null @@ -1,1756 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include - -#include -#include - -#include -#include - -namespace Test { - -namespace { - -template -struct TestViewLayoutTiled { - using Scalar = double; - - static constexpr int T0 = 2; - static constexpr int T1 = 4; - static constexpr int T2 = 4; - static constexpr int T3 = 2; - static constexpr int T4 = 2; - static constexpr int T5 = 2; - static constexpr int T6 = 2; - static constexpr int T7 = 2; - - // Rank 2 - using LayoutLL_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_2D_2x4 = - Kokkos::Experimental::LayoutTiled; - - // Rank 3 - using LayoutLL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled; - - // Rank 4 - using LayoutLL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutRL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutLR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - using LayoutRR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled; - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_2d(const int, const int) { -#else - static void test_view_layout_tiled_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - // Test create_mirror_view, deep_copy - // Create LL View - { - using ViewType = - typename Kokkos::View; - ViewType v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti 
< NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } - - // Create RL View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for 
(int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 
+ j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = - typename Kokkos::View; - Kokkos::View v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_2d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_3d(const int, const int, const int) { -#else - static void test_view_layout_tiled_3d(const int N0, const int N1, - const int 
N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Create LL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int 
ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - 
Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - 
Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_3d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_4d(const int, const int, const int, - const int){ -#else - static void test_view_layout_tiled_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Create LL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, 
Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 
0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate 
by tile - Kokkos::parallel_for( - "ViewTile rank 4 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View; - Kokkos::View dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 
RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_4d - - static void test_view_layout_tiled_subtile_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - - // Counter to check for errors at the end - long counter[4] = {0}; - - // Create LL View - { - Kokkos::View v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti 
<< "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj 
<< "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; - std::cout << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - } // end test_view_layout_tiled_subtile_2d - - static void test_view_layout_tiled_subtile_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - 
- // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < 
NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 
+ j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - 
ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_3d - - static void test_view_layout_tiled_subtile_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1 + - tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * 
T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - 
} - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 
0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } 
// end test_view_layout_tiled_subtile_4d - -}; // end TestViewLayoutTiled struct - -} // namespace - -TEST(TEST_CATEGORY, view_layouttiled) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled::test_view_layout_tiled_2d(4, 12); - TestViewLayoutTiled::test_view_layout_tiled_3d(4, 12, 16); - TestViewLayoutTiled::test_view_layout_tiled_4d(4, 12, 16, 12); -} -TEST(TEST_CATEGORY, view_layouttiled_subtile) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled::test_view_layout_tiled_subtile_2d(4, 12); - TestViewLayoutTiled::test_view_layout_tiled_subtile_3d(4, 12, - 16); - TestViewLayoutTiled::test_view_layout_tiled_subtile_4d( - 4, 12, 16, 12); -} -} // namespace Test - -#undef KOKKOS_IMPL_PUBLIC_INCLUDE From 51b98e1d76b198478b3975ff738a99272450c67e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 28 Mar 2024 23:26:50 -0400 Subject: [PATCH 348/432] Get rid of now unnecessary use of is_layouttiled trait --- containers/src/Kokkos_DynRankView.hpp | 3 +-- core/src/Kokkos_CopyViews.hpp | 26 ++++++++------------------ core/src/Kokkos_View.hpp | 6 ++---- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 5fa59f1b7cd..3989911aca4 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1657,8 +1657,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v || - is_layouttiled::value) { + std::is_same_v) { for (int i = N; i < 7; ++i) layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; } diff --git 
a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..bb44d33ba21 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -539,11 +539,8 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same::value) { + if (std::is_same::value) { iterate = Kokkos::Iterate::Right; } else if (std::is_same::value) { @@ -630,11 +627,8 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same::value) { + if (std::is_same::value) { iterate = Kokkos::Iterate::Right; } else if (std::is_same::value) { @@ -3092,8 +3086,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value> + Kokkos::LayoutStride>::value> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3139,8 +3132,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value)> + Kokkos::LayoutStride>::value)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3308,8 +3300,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value> + 
Kokkos::LayoutStride>::value> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3351,8 +3342,7 @@ inline std::enable_if_t< std::is_same::array_layout, Kokkos::LayoutRight>::value || std::is_same::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled::array_layout>::value)> + Kokkos::LayoutStride>::value)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 484a0e6f62e..0124d31107d 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -1442,8 +1442,7 @@ class View : public ViewTraits { std::is_same_v || std::is_same_v || - is_layouttiled::value) { + Kokkos::LayoutStride>) { size_t i0 = arg_layout.dimension[0]; size_t i1 = arg_layout.dimension[1]; size_t i2 = arg_layout.dimension[2]; @@ -1495,8 +1494,7 @@ class View : public ViewTraits { std::is_same_v || std::is_same_v || - is_layouttiled::value) { + Kokkos::LayoutStride>) { size_t i0 = arg_layout.dimension[0]; size_t i1 = arg_layout.dimension[1]; size_t i2 = arg_layout.dimension[2]; From 1efeb5d76d677e68701cd9592d475ee56f5a1dbc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 28 Mar 2024 23:27:18 -0400 Subject: [PATCH 349/432] Deprecate is_layouttiled trait --- core/src/Kokkos_Layout.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Layout.hpp b/core/src/Kokkos_Layout.hpp index 6c0a0c9cf1d..92719b3ec39 100644 --- a/core/src/Kokkos_Layout.hpp +++ b/core/src/Kokkos_Layout.hpp @@ -217,8 +217,10 @@ enum class Iterate { Right // Right indices stride fastest }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template -struct is_layouttiled : std::false_type {}; +struct KOKKOS_DEPRECATED is_layouttiled : std::false_type {}; +#endif // For use with view_copy template From 635551058d436885bf3757eb0bd1ea17bbf0abd4 Mon Sep 17 00:00:00 2001 From: Damien L-G 
Date: Mon, 1 Apr 2024 18:52:03 -0400 Subject: [PATCH 350/432] Move `Kokkos::Array` tests to a more suitable place (#6905) * Move Kokkos::Array tests to a more suitable place * Work around bogus(?) compile error with Array::operator[] not being constexpr --- core/unit_test/TestAggregate.hpp | 36 ------------------------ core/unit_test/TestArray.cpp | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index f1316a7426a..b4d924793c1 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -63,42 +63,6 @@ void TestViewAggregate() { ASSERT_EQ(y.extent(0), 4u); ASSERT_EQ(y.extent(1), 5u); ASSERT_EQ(y.extent(2), 32u); - - // Initialize arrays from brace-init-list as for std::array. - // - // Comment: Clang will issue the following warning if we don't use double - // braces here (one for initializing the Kokkos::Array and one for - // initializing the sub-aggreagate C-array data member), - // - // warning: suggest braces around initialization of subobject - // - // but single brace syntax would be valid as well. - Kokkos::Array aggregate_initialization_syntax_1 = {{1.41, 3.14}}; - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[0], 1.41); - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[1], 3.14); - - Kokkos::Array aggregate_initialization_syntax_2{ - {0, 1, 2}}; // since C++11 - for (int i = 0; i < 3; ++i) { - ASSERT_EQ(aggregate_initialization_syntax_2[i], i); - } - - // Note that this is a valid initialization. - Kokkos::Array initialized_with_one_argument_missing = {{255, 255}}; - for (int i = 0; i < 2; ++i) { - ASSERT_DOUBLE_EQ(initialized_with_one_argument_missing[i], 255); - } - // But the following line would not compile - // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; - - // The code below must compile for zero-sized arrays. 
- using T = float; - - constexpr int N = 0; - Kokkos::Array a; - for (int i = 0; i < N; ++i) { - a[i] = T(); - } } TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate(); } diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index 673d0036b71..e138a64d6db 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -73,4 +73,51 @@ KOKKOS_FUNCTION constexpr bool test_array_ctad() { static_assert(test_array_ctad()); #endif +KOKKOS_FUNCTION constexpr bool test_array_aggregate_initialization() { + // Initialize arrays from brace-init-list as for std::array. + + Kokkos::Array aggregate_initialization_syntax_1 = {1.41f, 3.14f}; + if ((aggregate_initialization_syntax_1[0] != 1.41f) || + (aggregate_initialization_syntax_1[1] != 3.14f)) + return false; + + Kokkos::Array aggregate_initialization_syntax_2{ + {0, 1, 2}}; // since C++11 + if ((aggregate_initialization_syntax_2[0] != 0) || + (aggregate_initialization_syntax_2[1] != 1) || + (aggregate_initialization_syntax_2[2] != 2)) + return false; + + // Note that this is a valid initialization. + Kokkos::Array initialized_with_one_argument_missing = {{255, 255}}; + if ((initialized_with_one_argument_missing[0] != 255) || + (initialized_with_one_argument_missing[1] != 255) || + (initialized_with_one_argument_missing[2] != 0)) + return false; + + // But the following line would not compile + // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; + + return true; +} + +static_assert(test_array_aggregate_initialization()); + +// A few compilers, such as GCC 8.4, were erroring out when the function below +// appeared in a constant expression because +// Kokkos::Array::operator[] is non-constexpr. The issue +// disappears with GCC 9.1 (https://godbolt.org/z/TG4TEef1b). As a workaround, +// the static_assert was dropped and the [[maybe_unused]] is used as an attempt +// to silent warnings that the function is never used. 
+[[maybe_unused]] KOKKOS_FUNCTION void test_array_zero_sized() { + using T = float; + + // The code below must compile for zero-sized arrays. + constexpr int N = 0; + Kokkos::Array a; + for (int i = 0; i < N; ++i) { + a[i] = T(); + } +} + } // namespace From 391e0408bed63b0ca0758da5dacb75a70677d0f7 Mon Sep 17 00:00:00 2001 From: Thomas Padioleau Date: Tue, 2 Apr 2024 11:19:17 +0200 Subject: [PATCH 351/432] Do not return a copy of the input functor for Kokkos::Experimental::for_each --- .../src/std_algorithms/Kokkos_ForEach.hpp | 56 +++++++++---------- .../impl/Kokkos_ForEachForEachN.hpp | 20 +++---- 2 files changed, 34 insertions(+), 42 deletions(-) diff --git a/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/algorithms/src/std_algorithms/Kokkos_ForEach.hpp index 6215b325afc..05969be463a 100644 --- a/algorithms/src/std_algorithms/Kokkos_ForEach.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ForEach.hpp @@ -29,49 +29,46 @@ namespace Experimental { template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_exespace_impl(label, ex, first, last, - std::move(functor)); +void for_each(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, UnaryFunctorType functor) { + Impl::for_each_exespace_impl(label, ex, first, last, std::move(functor)); } template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first, - IteratorType last, UnaryFunctorType functor) { - return Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", - ex, first, last, std::move(functor)); +void for_each(const ExecutionSpace& ex, IteratorType first, IteratorType last, + UnaryFunctorType functor) { + 
Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", ex, + first, last, std::move(functor)); } template < class ExecutionSpace, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +void for_each(const std::string& label, const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), + std::move(functor)); } template < class ExecutionSpace, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +void for_each(const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, - KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, + KE::begin(v), KE::end(v), std::move(functor)); } // @@ -82,24 +79,23 @@ UnaryFunctorType for_each(const ExecutionSpace& ex, template , int> = 0> -KOKKOS_FUNCTION UnaryFunctorType for_each(const TeamHandleType& teamHandle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_team_impl(teamHandle, first, last, 
std::move(functor)); } template , int> = 0> -KOKKOS_FUNCTION UnaryFunctorType -for_each(const TeamHandleType& teamHandle, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), + std::move(functor)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp index d3be3b7f667..99cc4a1cf3a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp @@ -42,10 +42,9 @@ struct StdForEachFunctor { }; template -UnaryFunctorType for_each_exespace_impl(const std::string& label, - const HandleType& handle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { +void for_each_exespace_impl(const std::string& label, const HandleType& handle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(handle, first); Impl::expect_valid_range(first, last); @@ -56,8 +55,6 @@ UnaryFunctorType for_each_exespace_impl(const std::string& label, label, RangePolicy(handle, 0, num_elements), StdForEachFunctor(first, functor)); handle.fence("Kokkos::for_each: fence after operation"); - - return functor; } template -KOKKOS_FUNCTION UnaryFunctorType -for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, - IteratorType last, UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks 
Impl::static_assert_random_access_and_accessible(teamHandle, first); Impl::expect_valid_range(first, last); @@ -96,7 +93,6 @@ for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, TeamThreadRange(teamHandle, 0, num_elements), StdForEachFunctor(first, functor)); teamHandle.team_barrier(); - return functor; } template Date: Thu, 28 Mar 2024 22:37:37 -0400 Subject: [PATCH 352/432] Drop specialization of ViewMapping for Kokkos::Array --- core/src/Kokkos_View.hpp | 1 - core/src/impl/Kokkos_ViewArray.hpp | 622 ----------------------------- core/unit_test/TestAggregate.hpp | 72 ---- core/unit_test/TestOther.hpp | 1 - 4 files changed, 696 deletions(-) delete mode 100644 core/src/impl/Kokkos_ViewArray.hpp delete mode 100644 core/unit_test/TestAggregate.hpp diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 484a0e6f62e..d8bf5fc0a88 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -522,7 +522,6 @@ constexpr bool is_assignable(const Kokkos::View& dst, //---------------------------------------------------------------------------- #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/impl/Kokkos_ViewArray.hpp b/core/src/impl/Kokkos_ViewArray.hpp deleted file mode 100644 index fe43b630184..00000000000 --- a/core/src/impl/Kokkos_ViewArray.hpp +++ /dev/null @@ -1,622 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP -#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP - -#include - -namespace Kokkos { -namespace Impl { - -template -struct ViewDataAnalysis> { - private: - using array_analysis = ViewArrayAnalysis; - - static_assert(std::is_void

::value); - static_assert(std::is_same>::value); - static_assert(std::is_scalar::value, - "View of Array type must be of a scalar type"); - - public: - using specialize = Kokkos::Array<>; - - using dimension = typename array_analysis::dimension; - - private: - enum { - is_const = std::is_same::value - }; - - using array_scalar_dimension = typename dimension::template append::type; - - using scalar_type = std::conditional_t; - using non_const_scalar_type = V; - using const_scalar_type = const V; - - public: - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - using scalar_array_type = - typename ViewDataType::type; - using const_scalar_array_type = - typename ViewDataType::type; - using non_const_scalar_array_type = - typename ViewDataType::type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief View mapping for non-specialized data type and standard layout */ -template -class ViewMapping> { - private: - template - friend class ViewMapping; - template - friend class Kokkos::View; - - using offset_type = ViewOffset; - - using handle_type = typename Traits::value_type::pointer; - - handle_type m_impl_handle; - offset_type m_impl_offset; - size_t m_stride = 0; - - using scalar_type = typename Traits::value_type::value_type; - - using contiguous_reference = Kokkos::Array::contiguous>; - using strided_reference = - Kokkos::Array::strided>; - - enum { - is_contiguous_reference = - (Traits::rank == 0) || (std::is_same::value) - }; - - enum { Array_N = 
Traits::value_type::size() }; - enum { Array_S = is_contiguous_reference ? Array_N : 1 }; - - KOKKOS_INLINE_FUNCTION - ViewMapping(const handle_type &arg_handle, const offset_type &arg_offset) - : m_impl_handle(arg_handle), - m_impl_offset(arg_offset), - m_stride(is_contiguous_reference ? 0 : arg_offset.span()) {} - - public: - //---------------------------------------- - // Domain dimensions - - static constexpr unsigned Rank = Traits::dimension::rank; - - template - KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const { - return m_impl_offset.m_dim.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - using dim_type = typename offset_type::dimension_type; - return dim_type::static_extent(r); - } - - KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() - const { - return m_impl_offset.layout(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { - return m_impl_offset.dimension_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { - return m_impl_offset.dimension_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { - return m_impl_offset.dimension_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { - return m_impl_offset.dimension_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { - return m_impl_offset.dimension_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { - return m_impl_offset.dimension_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { - return m_impl_offset.dimension_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { - return m_impl_offset.dimension_7(); - } - - // Is a regular layout with uniform striding for each index. 
- using is_regular = typename offset_type::is_regular; - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_impl_offset.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_impl_offset.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_impl_offset.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_impl_offset.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_impl_offset.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_impl_offset.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_impl_offset.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_impl_offset.stride_7(); - } - - //---------------------------------------- - // Range span - - /** \brief Span of the mapped range */ - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { - return m_impl_offset.span() * Array_N; - } - - /** \brief Is the mapped range span contiguous */ - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_impl_offset.span_is_contiguous(); - } - - using reference_type = - std::conditional_t; - - using pointer_type = handle_type; - - /** \brief If data references are lvalue_reference than can query pointer to - * memory */ - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_impl_handle; - } - - //---------------------------------------- - // The View class performs all rank and bounds checking before - // calling these element reference methods. 
- - KOKKOS_FORCEINLINE_FUNCTION - reference_type reference() const { - return reference_type(m_impl_handle + 0, Array_N, 0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0) const { - return reference_type(m_impl_handle + m_impl_offset(i0) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1, i2) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2, - const I3 &i3, - const I4 &i4) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4) * Array_S, Array_N, - m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6) * Array_S, - Array_N, m_stride); - } - - template - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const 
I6 &i6, const I7 &i7) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7) * Array_S, - Array_N, m_stride); - } - - //---------------------------------------- - - private: - enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ }; - enum { MemorySpanSize = sizeof(scalar_type) }; - - public: - /** \brief Span, in bytes, of the referenced memory */ - KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const { - return (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - } - - //---------------------------------------- - - KOKKOS_DEFAULTED_FUNCTION ViewMapping() = default; - - //---------------------------------------- - - template - KOKKOS_INLINE_FUNCTION ViewMapping(pointer_type ptr, Args... args) - : m_impl_handle(ptr), - m_impl_offset(std::integral_constant(), args...), - m_stride(m_impl_offset.span()) {} - - //---------------------------------------- - - template - Kokkos::Impl::SharedAllocationRecord<> *allocate_shared( - Kokkos::Impl::ViewCtorProp const &arg_prop, - typename Traits::array_layout const &arg_layout, - bool execution_space_specified) { - using alloc_prop = Kokkos::Impl::ViewCtorProp; - - using execution_space = typename alloc_prop::execution_space; - using memory_space = typename Traits::memory_space; - static_assert( - SpaceAccessibility::accessible); - using functor_type = - ViewValueFunctor; - using record_type = - Kokkos::Impl::SharedAllocationRecord; - - // Query the mapping for byte-size of allocation. - using padding = std::integral_constant< - unsigned int, alloc_prop::allow_padding ? 
sizeof(scalar_type) : 0>; - - m_impl_offset = offset_type(padding(), arg_layout); - - const size_t alloc_size = - (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - const auto &alloc_name = Impl::get_property(arg_prop); - const execution_space &exec_space = - Impl::get_property(arg_prop); - const memory_space &mem_space = - Impl::get_property(arg_prop); - - // Allocate memory from the memory space and create tracking record. - record_type *const record = - execution_space_specified - ? record_type::allocate(exec_space, mem_space, alloc_name, - alloc_size) - : record_type::allocate(mem_space, alloc_name, alloc_size); - - m_impl_handle = handle_type(reinterpret_cast(record->data())); - - functor_type functor = - execution_space_specified - ? functor_type(exec_space, (pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name) - : functor_type((pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name); - -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) - if (false) { - // Make sure the destroy functor gets instantiated. - // This avoids "cudaErrorInvalidDeviceFunction"-type errors. - functor.destroy_shared_allocation(); - } -#endif - - // Only initialize if the allocation is non-zero. - // May be zero if one of the dimensions is zero. - if constexpr (alloc_prop::initialize) - if (alloc_size) { - // Assume destruction is only required when construction is requested. - // The ViewValueFunctor has both value construction and destruction - // operators. 
- record->m_destroy = std::move(functor); - - // Construct values - record->m_destroy.construct_shared_allocation(); - } - - return record; - } -}; - -/** \brief Assign Array to non-Array */ - -template -class ViewMapping< - DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same>::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>> { - public: - // Can only convert to View::array_type - - enum { - is_assignable_data_type = - std::is_same::value && - (DstTraits::rank == SrcTraits::rank + 1) - }; - enum { - is_assignable = - std::is_same::value && - std::is_same::value - }; - - using TrackType = Kokkos::Impl::SharedAllocationTracker; - using DstType = ViewMapping; - using SrcType = ViewMapping>; - - KOKKOS_INLINE_FUNCTION - static void assign(DstType &dst, const SrcType &src, - const TrackType & /*src_track*/) { - static_assert(is_assignable, "Can only convert to array_type"); - - using dst_offset_type = typename DstType::offset_type; - - // Array dimension becomes the last dimension. - // Arguments beyond the destination rank are ignored. - if (src.span_is_contiguous()) { // not padded - dst.m_impl_offset = dst_offset_type( - std::integral_constant(), - typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? 
src.dimension_7() - : SrcTraits::value_type::size()))); - } else { // is padded - using padded = std::integral_constant< - unsigned int, sizeof(typename SrcTraits::value_type::value_type)>; - - dst.m_impl_offset = dst_offset_type( - padded(), typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? src.dimension_7() - : SrcTraits::value_type::size()))); - } - - dst.m_impl_handle = src.m_impl_handle; - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ViewMapping< - std::enable_if_t<( - std::is_same>::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, - SrcTraits, Args...> { - private: - static_assert(SrcTraits::rank == sizeof...(Args)); - - enum : bool { - R0 = is_integral_extent<0, Args...>::value, - R1 = is_integral_extent<1, Args...>::value, - R2 = is_integral_extent<2, Args...>::value, - R3 = is_integral_extent<3, Args...>::value, - R4 = is_integral_extent<4, Args...>::value, - R5 = is_integral_extent<5, Args...>::value, - R6 = is_integral_extent<6, Args...>::value, - R7 = is_integral_extent<7, Args...>::value - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) - }; - - // Whether right-most rank is a range. - enum { - R0_rev = - 0 == SrcTraits::rank - ? 
false - : (1 == SrcTraits::rank - ? R0 - : (2 == SrcTraits::rank - ? R1 - : (3 == SrcTraits::rank - ? R2 - : (4 == SrcTraits::rank - ? R3 - : (5 == SrcTraits::rank - ? R4 - : (6 == SrcTraits::rank - ? R5 - : (7 == SrcTraits::rank - ? R6 - : R7))))))) - }; - - // Subview's layout - using array_layout = - std::conditional_t<((rank == 0) || - (rank <= 2 && R0 && - std::is_same::value) || - (rank <= 2 && R0_rev && - std::is_same::value)), - typename SrcTraits::array_layout, - Kokkos::LayoutStride>; - - using value_type = typename SrcTraits::value_type; - - using data_type = std::conditional_t< - rank == 0, value_type, - std::conditional_t< - rank == 1, value_type *, - std::conditional_t< - rank == 2, value_type **, - std::conditional_t< - rank == 3, value_type ***, - std::conditional_t< - rank == 4, value_type ****, - std::conditional_t< - rank == 5, value_type *****, - std::conditional_t< - rank == 6, value_type ******, - std::conditional_t>>>>>>>; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - KOKKOS_INLINE_FUNCTION - static void assign(ViewMapping &dst, - ViewMapping const &src, Args... 
args) { - using DstType = ViewMapping; - - using dst_offset_type = typename DstType::offset_type; - using dst_handle_type = typename DstType::handle_type; - - const SubviewExtents extents(src.m_impl_offset.m_dim, - args...); - - dst.m_impl_offset = dst_offset_type(src.m_impl_offset, extents); - dst.m_impl_handle = dst_handle_type( - src.m_impl_handle + - src.m_impl_offset(extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6), extents.domain_offset(7))); - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */ diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp deleted file mode 100644 index b4d924793c1..00000000000 --- a/core/unit_test/TestAggregate.hpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef TEST_AGGREGATE_HPP -#define TEST_AGGREGATE_HPP - -#include - -namespace Test { - -template -void TestViewAggregate() { - using value_type = Kokkos::Array; - using analysis_1d = - Kokkos::Impl::ViewDataAnalysis; - - static_assert( - std::is_same >::value); - - using a32_traits = Kokkos::ViewTraits; - using flat_traits = - Kokkos::ViewTraits; - - static_assert( - std::is_same >::value); - static_assert( - std::is_same::value); - static_assert(a32_traits::rank == 2); - static_assert(a32_traits::rank_dynamic == 2); - - static_assert(std::is_void::value); - static_assert(flat_traits::rank == 3); - static_assert(flat_traits::rank_dynamic == 2); - static_assert(flat_traits::dimension::N2 == 32); - - using a32_type = Kokkos::View **, DeviceType>; - using a32_flat_type = typename a32_type::array_type; - - static_assert(std::is_same::value); - static_assert(std::is_same::value); - static_assert(a32_type::rank == 2); - static_assert(a32_flat_type::rank == 3); - - a32_type x("test", 4, 5); - a32_flat_type y(x); - - ASSERT_EQ(x.extent(0), 4u); - ASSERT_EQ(x.extent(1), 5u); - ASSERT_EQ(y.extent(0), 4u); - ASSERT_EQ(y.extent(1), 5u); - ASSERT_EQ(y.extent(2), 32u); -} - -TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate(); } - -} // namespace Test - -#endif /* #ifndef TEST_AGGREGATE_HPP */ diff --git a/core/unit_test/TestOther.hpp b/core/unit_test/TestOther.hpp index fcf0353a88c..2a5bf4a7166 100644 --- a/core/unit_test/TestOther.hpp +++ b/core/unit_test/TestOther.hpp @@ -16,7 +16,6 @@ #ifndef KOKKOS_TEST_OTHER_HPP #define KOKKOS_TEST_OTHER_HPP -#include #include #include From 059cd15c0b2c23443d2e870bcfe767611386157f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 2 Apr 2024 16:12:26 -0400 Subject: [PATCH 353/432] Accommodate users that depend on a code that define silly macros (#6909) * Accommodate users that depend on a code that define silly macros Reported on Slack, user was getting 
when including some other dependency ``` /include/Kokkos_Tuners.hpp(259): error: expected a "," or ">" template ^ ``` It turns out that dependency defines macros such as "X", "Y", "Z", or even "DIM" which is really asking for trouble. See here https://github.com/fluiddynsci/EngSketchPad/blob/1fe3fc4c68a759e0832c02f8d3c2bd8722f183a8/include/libCart3D/c3d_global.h#L24-L28 Since it does not take much change to accommodate that user, we avoid named parameters that collide with these silly macro defines. We are not promising that we support it but this should help. * Per review rename SZ -> ArraySize --- core/src/Kokkos_Tuners.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_Tuners.hpp b/core/src/Kokkos_Tuners.hpp index 618401654e7..f5ffc66af5b 100644 --- a/core/src/Kokkos_Tuners.hpp +++ b/core/src/Kokkos_Tuners.hpp @@ -256,13 +256,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template struct GetPoint; -template -struct GetPoint> { +template +struct GetPoint< + PointType, + std::array> { using index_set_type = - std::array; + std::array; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence{}); + return get_point_helper(in, indices, std::make_index_sequence{}); } }; From 2aecb1d2412eca0729f3bc3da9e10c215c17da78 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 2 Apr 2024 21:57:09 -0400 Subject: [PATCH 354/432] SYCL: Fix multi-GPU support and add test (#6887) * SYCL: Fix multi-GPU support and add test * Combine Cuda and SYCL test * Drop exception_handler --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 23 +-- core/unit_test/CMakeLists.txt | 10 +- core/unit_test/TestMultiGPU.hpp | 184 ++++++++++++++++++ .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 162 +-------------- .../sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp | 64 ++++++ 5 files changed, 269 insertions(+), 174 deletions(-) create mode 100644 
core/unit_test/TestMultiGPU.hpp create mode 100644 core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 0e67adb5787..4a1c910c73d 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -171,21 +171,22 @@ sycl::device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. - if (m_team_scratch_current_size[scratch_pool_id] == 0) { + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -234,8 +235,8 @@ void SYCLInternal::finalize() { for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) { - 
Kokkos::kokkos_free( - m_team_scratch_ptr[i]); + device_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; } diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 6dfb7505c5d..a2350235318 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -902,15 +902,21 @@ if(Kokkos_ENABLE_SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_SYCLInterOpInit_Context SOURCES - UnitTestMainInit.cpp + UnitTestMainInit.cpp sycl/TestSYCL_InterOp_Init_Context.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_SYCLInterOpStreams SOURCES - UnitTestMain.cpp + UnitTestMain.cpp sycl/TestSYCL_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_SYCLInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp + ) endif() SET(DEFAULT_DEVICE_SOURCES diff --git a/core/unit_test/TestMultiGPU.hpp b/core/unit_test/TestMultiGPU.hpp new file mode 100644 index 00000000000..aad2fa45f49 --- /dev/null +++ b/core/unit_test/TestMultiGPU.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + exec.fence(); + exec0.fence(); + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + exec.fence(); + exec0.fence(); + Kokkos::parallel_reduce( + "Test::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + Kokkos::parallel_reduce("Test::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::Team", + 
Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + 
ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} +} // namespace diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp index d94735ceb23..40955e9c7ca 100644 --- a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -15,7 +15,7 @@ //@HEADER #include -#include +#include namespace { @@ -57,79 +57,6 @@ std::array get_execution_spaces( return {exec0, exec1}; } -// Test Interoperability with Cuda Streams -void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, - TEST_EXECSPACE exec, Kokkos::View v) { - using MemorySpace = typename TEST_EXECSPACE::memory_space; - - Kokkos::deep_copy(exec, v, 5); - Kokkos::deep_copy(exec0, v0, 5); - - Kokkos::deep_copy(v, v0); - - int sum; - int sum0; - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", - Kokkos::RangePolicy(exec0, 0, 100), - Test::FunctorRange(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", - Kokkos::RangePolicy(exec, 0, 100), - Test::FunctorRange(v)); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::RangeReduce_0", - Kokkos::RangePolicy>(exec0, - 0, 100), - Test::FunctorRangeReduce(v0), sum0); - Kokkos::parallel_reduce( - 
"Test::cuda::raw_cuda_stream::RangeReduce", - Kokkos::RangePolicy>(exec, 0, - 100), - Test::FunctorRangeReduce(v), sum); - ASSERT_EQ(600, sum0); - ASSERT_EQ(600, sum); - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", - Kokkos::MDRangePolicy>( - exec0, {0, 0}, {10, 10}), - Test::FunctorMDRange(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", - Kokkos::MDRangePolicy>( - exec, {0, 0}, {10, 10}), - Test::FunctorMDRange(v)); - Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", - Kokkos::MDRangePolicy, - Kokkos::LaunchBounds<128, 2>>( - exec0, {0, 0}, {10, 10}), - Test::FunctorMDRangeReduce(v0), sum0); - Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", - Kokkos::MDRangePolicy, - Kokkos::LaunchBounds<128, 2>>( - exec, {0, 0}, {10, 10}), - Test::FunctorMDRangeReduce(v), sum); - ASSERT_EQ(700, sum0); - ASSERT_EQ(700, sum); - - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", - Kokkos::TeamPolicy(exec0, 10, 10), - Test::FunctorTeam(v0)); - Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", - Kokkos::TeamPolicy(exec, 10, 10), - Test::FunctorTeam(v)); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::Team_0", - Kokkos::TeamPolicy>(exec0, - 10, 10), - Test::FunctorTeamReduce(v0), sum0); - Kokkos::parallel_reduce( - "Test::cuda::raw_cuda_stream::Team", - Kokkos::TeamPolicy>(exec, 10, - 10), - Test::FunctorTeamReduce(v), sum); - ASSERT_EQ(800, sum0); - ASSERT_EQ(800, sum); -} - TEST(cuda_multi_gpu, managed_views) { StreamsAndDevices streams_and_devices; { @@ -169,93 +96,6 @@ TEST(cuda_multi_gpu, unmanaged_views) { } } -struct ScratchFunctor { - int scratch_size; - int R; - - ScratchFunctor(int scratch_size_, int R_) - : scratch_size(scratch_size_), R(R_) {} - - KOKKOS_FUNCTION - void operator()(const Kokkos::TeamPolicy::member_type &team, - int &error_accum) const { - Kokkos::View scratch_mem( - team.team_scratch(1), scratch_size); - - // Initialize scratch memory - 
Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i) { scratch_mem(i) = 0; }); - team.team_barrier(); - - // Increment each entry in scratch memory R times - for (int r = 0; r < R; ++r) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i) { scratch_mem(i) += 1; }); - } - team.team_barrier(); - - // Check that each scratch entry has been incremented exactly R times - int team_error_accum; - auto R_loc = R; // avoid implicit capture of this - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(team, 0, scratch_size), - [&](int i, int &tsum) { - if (scratch_mem(i) != R_loc) { - tsum += 1; - } - }, - team_error_accum); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { error_accum += team_error_accum; }); - } -}; - -void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { - constexpr int N = 10; - constexpr int R = 1000; - constexpr int scratch_size = 100; - using ScratchType = Kokkos::View; - - // Test allocating and using scratch space - ScratchFunctor f(scratch_size, R); - - auto policy0 = - Kokkos::TeamPolicy(exec0, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); - auto policy1 = - Kokkos::TeamPolicy(exec1, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); - - int error0, error1; - - Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); - Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); - ASSERT_EQ(error0, 0); - ASSERT_EQ(error1, 0); - - // Request larger scratch size to trigger a realloc and test - const auto new_scratch_size = scratch_size + 10; - ScratchFunctor f_more_scratch(new_scratch_size, R); - - auto policy0_more_scratch = - Kokkos::TeamPolicy(exec0, N, 10) - .set_scratch_size( - 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); - auto policy1_more_scratch = - Kokkos::TeamPolicy(exec1, N, 10) - .set_scratch_size( - 1, 
Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); - - Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, - f_more_scratch, error0); - Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, - f_more_scratch, error1); - ASSERT_EQ(error0, 0); - ASSERT_EQ(error1, 0); -} - TEST(cuda_multi_gpu, scratch_space) { StreamsAndDevices streams_and_devices; { diff --git a/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp b/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..d3906e409f5 --- /dev/null +++ b/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +std::array get_execution_spaces() { + std::vector gpu_devices = + sycl::device::get_devices(sycl::info::device_type::gpu); + + TEST_EXECSPACE exec0( + sycl::queue{gpu_devices.front(), sycl::property::queue::in_order()}); + TEST_EXECSPACE exec1( + sycl::queue{gpu_devices.back(), sycl::property::queue::in_order()}); + + return {exec0, exec1}; +} + +TEST(sycl_multi_gpu, managed_views) { + std::array execs = get_execution_spaces(); + + Kokkos::View view0(Kokkos::view_alloc("v0", execs[0]), + 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); +} + +TEST(sycl_multi_gpu, unmanaged_views) { + std::array execs = get_execution_spaces(); + + int *p0 = sycl::malloc_device(100, execs[0].sycl_queue()); + Kokkos::View view0(p0, 100); + + int *p1 = sycl::malloc_device(100, execs[1].sycl_queue()); + Kokkos::View view1(p1, 100); + + test_policies(execs[0], view0, execs[1], view1); + sycl::free(p0, execs[0].sycl_queue()); + sycl::free(p1, execs[1].sycl_queue()); +} + +TEST(sycl_multi_gpu, scratch_space) { + std::array execs = get_execution_spaces(); + + test_scratch(execs[0], execs[1]); +} +} // namespace From caa139c9b9307c88599bbe9e5e180b6db44692e4 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 3 Apr 2024 18:35:21 -0400 Subject: [PATCH 355/432] SYCL: Unroll shuffle loops for top-level parallel_reduce and parallel_scan (#6750) * SYCL: Unroll shuffle loops for top-level parallel_reduce and parallel_scan * Rename second lambda * Assert upper bounds and remove unattainable cases * Fix sign comparison warnings --- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 35 ++++++++++++++ .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 48 +++++++++++++++---- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp 
b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 977b69bc9eb..3306efa957e 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -36,10 +36,25 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, const int sg_group_id = sg.get_group_id()[0]; const int id_in_sg = sg.get_local_id()[0]; +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < global_range) { + auto tmp = sg.shuffle_up(local_value, stride); + if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(global_range <= 32); +#else for (int stride = 1; stride < global_range; stride <<= 1) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } +#endif const int max_subgroup_size = sg.get_max_local_range()[0]; const int n_active_subgroups = @@ -61,6 +76,25 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, const auto upper_bound = std::min(local_range, n_active_subgroups - round * local_range); auto local_sg_value = local_mem[idx < n_active_subgroups ? 
idx : 0]; +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < upper_bound) { + auto tmp = sg.shuffle_up(local_sg_value, stride); + if (id_in_sg >= stride) { + if (idx < n_active_subgroups) + final_reducer.join(&local_sg_value, &tmp); + else + local_sg_value = tmp; + } + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(upper_bound <= 32); +#else for (int stride = 1; stride < upper_bound; stride <<= 1) { auto tmp = sg.shuffle_up(local_sg_value, stride); if (id_in_sg >= stride) { @@ -70,6 +104,7 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, local_sg_value = tmp; } } +#endif if (idx < n_active_subgroups) { local_mem[idx] = local_sg_value; if (round > 0) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index c308384af09..7069805a5b5 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,11 +21,9 @@ namespace Kokkos::Impl::SYCLReduction { -// FIXME_SYCL It appears that using shuffles is slower than going through local -// memory. template -inline constexpr bool use_shuffle_based_algorithm = false; -// std::is_reference_v; +inline constexpr bool use_shuffle_based_algorithm = + std::is_reference_v; template std::enable_if_t> workgroup_reduction( @@ -109,17 +107,31 @@ std::enable_if_t> workgroup_reduction( // Perform the actual workgroup reduction in each subgroup // separately. 
- auto sg = item.get_sub_group(); - const int id_in_sg = sg.get_local_id()[0]; - const auto local_range = - std::min(sg.get_local_range()[0], max_size); + auto sg = item.get_sub_group(); + const int id_in_sg = sg.get_local_id()[0]; + const int local_range = std::min(sg.get_local_range()[0], max_size); const auto upper_stride_bound = - std::min(local_range - id_in_sg, max_size - local_id); + std::min(local_range - id_in_sg, max_size - local_id); +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < local_range) { + auto tmp = sg.shuffle_down(local_value, stride); + if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { auto tmp = sg.shuffle_down(local_value, stride); if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); } +#endif // Copy the subgroup results into the first positions of the // reduction array. @@ -140,7 +152,7 @@ std::enable_if_t> workgroup_reduction( // the first subgroup, we first combine the items with a higher // index. if (n_active_subgroups > local_range) { - for (unsigned int offset = local_range; offset < n_active_subgroups; + for (int offset = local_range; offset < n_active_subgroups; offset += local_range) if (id_in_sg + offset < n_active_subgroups) { final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]); @@ -149,11 +161,27 @@ std::enable_if_t> workgroup_reduction( } // Then, we proceed as before. 
+#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < local_range) { + auto tmp = sg.shuffle_down(sg_value, stride); + if (id_in_sg + stride < n_active_subgroups) + final_reducer.join(&sg_value, &tmp); + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { auto tmp = sg.shuffle_down(sg_value, stride); if (id_in_sg + stride < n_active_subgroups) final_reducer.join(&sg_value, &tmp); } +#endif // Finally, we copy the workgroup results back to global memory // to be used in the next iteration. If this is the last From a833fb00ba79b36354fd45e3fbae1e1621e187d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Chevalier?= Date: Thu, 4 Apr 2024 13:53:12 +0200 Subject: [PATCH 356/432] Preparing readme for develop as the default branch (#6796) * Add 'Quick Start' section to README.md Added an extensive 'Quick Start' guide to the README.md document, which includes instructions for obtaining the latest release of Kokkos, building Kokkos for different platforms such as CPU on Linux or macOS, CUDA on Linux, and HIP on Linux, and using Kokkos in a CMake project. * Update hyperlinks in README.md The previous links pointing to kokkos.github.io have been changed to point correctly to kokkos.org. * Refine instructions for getting and building Kokkos in README.md The README.md file has been updated with simplified, clearer instructions. Changes include refined steps to obtain the latest Kokkos release using curl and to clone the development version of Kokkos from GitHub. * Add Kokkos installation guide using Spack in README.md * Reorder Readme and add links * Readme: Fix typo in cmake option * Fix typo in Readme Co-authored-by: Daniel Arndt * Readme: update version and hip instructions. 
* README: Remove building examples Adding one liner to get latest release * Fix a typo * dalg24 suggestions Co-authored-by: Damien L-G --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- README.md | 61 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 19793bb82d9..f4252437111 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) +[![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4)](https://kokkos.org) # Kokkos: Core Libraries @@ -10,43 +10,64 @@ hierarchies and multiple types of execution resources. It currently can use CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. -**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.** +**Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).** -For the complete documentation, click below: +## Learning about Kokkos -# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki) +To start learning about Kokkos: -# Learning about Kokkos +- [Kokkos Lectures](https://kokkos.org/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important capabilities. -To start learning about Kokkos: +- [Programming guide](https://kokkos.org/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. -- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities. 
+- [API reference](https://kokkos.org/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.org/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.org/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.org/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.org/kokkos-core-wiki/API/alphabetical.html). -- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. +- [Use cases and Examples](https://kokkos.org/kokkos-core-wiki/usecases.html): a serie of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html). +## Obtaining Kokkos -- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. +The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. +The current release is [4.2.01](https://github.com/kokkos/kokkos/releases/tag/4.2.01). 
-For non-public questions send an email to: *crtrott(at)sandia.gov* +```bash +curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +# Or with wget +wget https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +``` + +To clone the latest development version of Kokkos from GitHub: + +```bash +git clone -b develop https://github.com/kokkos/kokkos.git +``` + +### Building Kokkos -# Contributing to Kokkos +To build Kokkos, you will need to have a C++ compiler that supports C++14 or later. +All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html). -Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute. +Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html). -# Requirements, Building and Installing +You can also install Kokkos using [Spack](https://spack.io/): `spack install kokkos`. [Available configuration options](https://packages.spack.io/package.html?name=kokkos) can be displayed using `spack info kokkos`. + +## For the complete documentation: [kokkos.org/kokkos-core-wiki/](https://kokkos.org/kokkos-core-wiki/) + +## Support + +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. + +For non-public questions send an email to: *crtrott(at)sandia.gov* -All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html). +## Contributing -Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html). +Please see [this page](https://kokkos.org/kokkos-core-wiki/contributing.html) for details on how to contribute. -# Citing Kokkos +## Citing Kokkos -Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html). 
+Please see the [following page](https://kokkos.org/kokkos-core-wiki/citation.html). -# License +## License [![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) From 497b438f1f16d63faa55240772b1f0d8681a7a89 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 4 Apr 2024 14:18:13 -0600 Subject: [PATCH 357/432] CHANGELOG.md: 4.3.00 update --- CHANGELOG.md | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1d06958295..f7b8af7695c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,105 @@ # CHANGELOG +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* Don't use the compiler launcher script if the CMake compile language is CUDA. 
[\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: + * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) + * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support OneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_oneapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6758) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6577) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) + +#### OpenMP: +* Improve performance of view
initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `<Kokkos_Clamp.hpp>` and `<Kokkos_MinMax.hpp>` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessary initialization + avoid evaluating the predicate twice during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516) + +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISCV and the Milk-V's Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26
standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove all `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation)
[\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions [\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overloads that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU
architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) From 4b90930999610dbc60bfdaddfe7ff1385a4fb997 Mon Sep 17 00:00:00 2001 From: Paul Zehner Date: Fri, 5 Apr 2024 05:34:08 +0200 Subject: [PATCH 358/432] Refactor: Uniformize `create_mirror*` parameter name for views (#6917) * Uniformize view name for create_mirror * Uniformize view name for create_mirror_view * Fix formatting --- core/src/Kokkos_CopyViews.hpp | 37 ++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index 08f6ba8d696..90a438a9df8 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3506,24 +3506,24 @@ auto create_mirror(const Kokkos::View& src, template std::enable_if_t::specialize>::value, typename Kokkos::View::HostMirror> -create_mirror(Kokkos::View const& v) { - return Impl::create_mirror(v, Impl::ViewCtorProp<>{}); +create_mirror(Kokkos::View const& src) { + return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } template std::enable_if_t::specialize>::value, typename Kokkos::View::HostMirror> create_mirror(Kokkos::Impl::WithoutInitializing_t wi, - Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(wi)); + Kokkos::View const& src) { + return Impl::create_mirror(src, view_alloc(wi)); } template ::value>> std::enable_if_t::specialize>::value, typename Impl::MirrorType::view_type> -create_mirror(Space const&, Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{})); +create_mirror(Space const&, Kokkos::View const& src) { + return Impl::create_mirror(src, view_alloc(typename Space::memory_space{})); } template ::specialize>::value && Impl::ViewCtorProp::has_memory_space>> auto create_mirror(Impl::ViewCtorProp const& 
arg_prop, - Kokkos::View const& v) { - return Impl::create_mirror(v, arg_prop); + Kokkos::View const& src) { + return Impl::create_mirror(src, arg_prop); } template @@ -3541,8 +3541,8 @@ std::enable_if_t< !Impl::ViewCtorProp::has_memory_space, typename Kokkos::View::HostMirror> create_mirror(Impl::ViewCtorProp const& arg_prop, - Kokkos::View const& v) { - return Impl::create_mirror(v, arg_prop); + Kokkos::View const& src) { + return Impl::create_mirror(src, arg_prop); } template ::specialize>::value, typename Impl::MirrorType::view_type> create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{}, wi)); + Kokkos::View const& src) { + return Impl::create_mirror(src, + view_alloc(typename Space::memory_space{}, wi)); } namespace Impl { @@ -3647,8 +3648,8 @@ create_mirror_view(const Kokkos::View& src) { template typename Kokkos::View::HostMirror create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View const& v) { - return Impl::create_mirror_view(v, view_alloc(wi)); + Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View const& src) { + return Impl::create_mirror_view(src, view_alloc(wi)); } // FIXME_C++17 Improve SFINAE here. 
@@ -3675,15 +3676,15 @@ template ::value>> typename Impl::MirrorViewType::view_type create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View const& v) { + Kokkos::View const& src) { return Impl::create_mirror_view( - v, view_alloc(typename Space::memory_space{}, wi)); + src, view_alloc(typename Space::memory_space{}, wi)); } template auto create_mirror_view(const Impl::ViewCtorProp& arg_prop, - const Kokkos::View& v) { - return Impl::create_mirror_view(v, arg_prop); + const Kokkos::View& src) { + return Impl::create_mirror_view(src, arg_prop); } template From 98b1a38e5391c57b866c4b16bf8108fce56d827a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 5 Apr 2024 16:06:48 -0400 Subject: [PATCH 359/432] SYCL: Improve team_reduce implementation (#6562) * SYCL: Improve team_reduce implementation * Manually unroll loop to propagate shuffle destinations at compile-time * Update step_width * Fix sign comparison warnings * Comment and choosing step_width=16 * KOKKOS_ASSERT that the subgroup range doesn't exceed 32 * Remove unattainable cases * Add a barrier guarding the reduction array upon exiting team_reduce * Update comment to better reflect intent of barrier --------- Co-authored-by: Christian Trott --- core/src/SYCL/Kokkos_SYCL_Team.hpp | 77 ++++++++++++++---------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index dbba3827581..2b4c2be5227 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -133,72 +133,69 @@ class SYCLTeamMember { const unsigned int team_rank_ = team_rank(); // First combine the values in the same subgroup +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int shift) { + if (vector_range * shift < sub_group_range) { + const value_type tmp = sg.shuffle_down(value, vector_range * shift); + if (team_rank_ + shift < team_size_) 
reducer.join(value, tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(sub_group_range <= 32); +#else for (unsigned int shift = 1; vector_range * shift < sub_group_range; shift <<= 1) { const value_type tmp = sg.shuffle_down(value, vector_range * shift); if (team_rank_ + shift < team_size_) reducer.join(value, tmp); } +#endif value = sg.shuffle(value, 0); - const auto n_subgroups = sg.get_group_range()[0]; + const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { reducer.reference() = value; return; } - // We need to chunk up the whole reduction because we might not have - // allocated enough memory. - const unsigned int maximum_work_range = - std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); + // It was found experimentally that 16 is a good value for Intel PVC. + // Since there is a maximum number of 1024 threads with subgroup size 16, + // we have a maximum of 64 subgroups per workgroup which means 64/16=4 + // rounds for loading values into the reduction_array, and 16 redundant + // reduction steps executed by every thread. + constexpr int step_width = 16; + auto tmp_alloc = sycl::ext::oneapi::group_local_memory_for_overwrite< + value_type[step_width]>(m_item.get_group()); + auto& reduction_array = *tmp_alloc; const auto id_in_sg = sg.get_local_id()[0]; - auto reduction_array = - static_cast>(m_team_reduce); - // Load values into the first maximum_work_range values of the reduction + // Load values into the first step_width values of the reduction // array in chunks. This means that only sub groups with an id in the // corresponding chunk load values. 
- const auto group_id = sg.get_group_id()[0]; - if (id_in_sg == 0 && group_id < maximum_work_range) + const int group_id = sg.get_group_id()[0]; + if (id_in_sg == 0 && group_id < step_width) reduction_array[group_id] = value; sycl::group_barrier(m_item.get_group()); - for (unsigned int start = maximum_work_range; start < n_subgroups; - start += maximum_work_range) { + for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && - group_id < - std::min(start + maximum_work_range, n_subgroups)) + group_id < std::min(start + step_width, n_subgroups)) reducer.join(reduction_array[group_id - start], value); sycl::group_barrier(m_item.get_group()); } - // Let the first subgroup do the final reduction - if (group_id == 0) { - const auto local_range = sg.get_local_range()[0]; - auto result = - reduction_array[id_in_sg < maximum_work_range ? id_in_sg : 0]; - // In case the maximum_work_range is larger than the range of the first - // subgroup, we first combine the items with a higher index. - for (unsigned int offset = local_range; offset < maximum_work_range; - offset += local_range) - if (id_in_sg + offset < maximum_work_range) - reducer.join(result, reduction_array[id_in_sg + offset]); - sycl::group_barrier(sg); - - // Now do the actual subgroup reduction. - const auto min_range = - std::min(maximum_work_range, local_range); - for (unsigned int stride = 1; stride < min_range; stride <<= 1) { - const auto tmp = sg.shuffle_down(result, stride); - if (id_in_sg + stride < min_range) reducer.join(result, tmp); - } - if (id_in_sg == 0) reduction_array[0] = result; - } - sycl::group_barrier(m_item.get_group()); + // Do the final reduction for all threads redundantly + value = reduction_array[0]; + for (int i = 1; i < std::min(step_width, n_subgroups); ++i) + reducer.join(value, reduction_array[i]); - reducer.reference() = reduction_array[0]; - // Make sure that the reduction array hasn't been modified in the meantime. 
- m_item.barrier(sycl::access::fence_space::local_space); + reducer.reference() = value; + // Make sure that every thread is done using the reduction array. + sycl::group_barrier(m_item.get_group()); } //-------------------------------------------------------------------------- From 55c5757502bdc5d5f6b9658a91114dcf56df6a4b Mon Sep 17 00:00:00 2001 From: Thomas Conrad Clevenger Date: Mon, 8 Apr 2024 11:04:49 -0600 Subject: [PATCH 360/432] Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors (#6891) * Use team_size_recommended in cuda ParallelFor constructor Previous computation was intended to match team_size_recommended, but missing extra scratch space allocation. * Same change for cuda ParallelReduce() * Remove unused attr variable * Use team_size_max() in pfor constructor instead of recomputing --- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 38 +++++---------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 9f7be45c839..71e77518210 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,17 +539,9 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes( - internal_space_instance->m_cudaDev); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, m_functor, m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -585,13 +577,7 @@ class ParallelFor, "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } - if (int(m_team_size) > - int(Kokkos::Impl::cuda_get_max_block_size( - internal_space_instance, attr, arg_functor, - arg_policy.impl_vector_length(), - arg_policy.team_scratch_size(0), - arg_policy.thread_scratch_size(0)) / - arg_policy.impl_vector_length())) { + if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } @@ -909,17 +895,11 @@ class ParallelReduce:: - get_cuda_func_attributes(internal_space_instance->m_cudaDev); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size( - internal_space_instance, attr, - m_functor_reducer.get_functor(), m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); m_team_begin = UseShflReduction From 8cf841076625376f34c516e557c1c66bcda97122 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 8 Apr 2024 14:16:12 -0400 Subject: [PATCH 361/432] SYCL: Fix range in subgroup scan for workgroup_scan --- core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 3306efa957e..58cfea6a97a 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -35,10 +35,11 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, auto sg = item.get_sub_group(); const int sg_group_id = sg.get_group_id()[0]; const int id_in_sg = sg.get_local_id()[0]; + const int local_range = std::min(sg.get_local_range()[0], global_range); #if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) auto shuffle_combine = [&](int stride) { - if (stride < global_range) { + if (stride < local_range) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } @@ -48,9 +49,9 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, shuffle_combine(4); shuffle_combine(8); shuffle_combine(16); - KOKKOS_ASSERT(global_range <= 32); + KOKKOS_ASSERT(local_range <= 32); #else - for (int stride = 1; stride < global_range; stride <<= 1) { + for (int stride = 1; stride < local_range; stride <<= 1) { auto tmp = sg.shuffle_up(local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } @@ -60,7 +61,6 @@ void workgroup_scan(sycl::nd_item item, const FunctorType& final_reducer, const int n_active_subgroups = (global_range + max_subgroup_size - 1) / max_subgroup_size; - const 
int local_range = sg.get_local_range()[0]; if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups) local_mem[sg_group_id] = local_value; local_value = sg.shuffle_up(local_value, 1); From 3a27cdbc276fd74f1e1860c939b6862936eb3bcb Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 10 Apr 2024 09:48:48 -0400 Subject: [PATCH 362/432] Add ROCm 6.0 in the nightly CI --- .jenkins_nightly | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/.jenkins_nightly b/.jenkins_nightly index 5d585817891..b213ba3061a 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -102,6 +102,39 @@ pipeline { ''' } } + stage('HIP-ROCM-6.0') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:6.0.2-complete' + label 'rocm-docker && AMD_Radeon_Instinct_MI210' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. 
&& \ + make -j8 && ctest --verbose''' + } + post { + always { + sh 'ccache --show-stats' + } + } } } } From 164519d7d94cf9d260dd7834cef9fae979f93358 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 10 Apr 2024 14:20:00 -0600 Subject: [PATCH 363/432] MI300 support unified memory support (#6877) * Make Host backends be included before device backends There was an inclusion order issue when trying to enable unified memory, which was due to incomplete types. Effectively the Host backends must be defined before the device memory spaces for them to be marked accessible. We may have the same issue if we try to mark HostSpace accessible by device memory spaces - but I am not sure. * Add unified memory arch support for AMD This is intended for MI300A, but for now you have to set -DKokkos_ENABLE_IMPL_HIP_UNIFIED_MEMORY=ON This marks HIPSpace as host accessible, and thus will make create_mirror_view a no-op. * Fix two tests for AMD unified memory archs * Address review comments --- Makefile.kokkos | 32 ++++++++-------- .../src/sorting/impl/Kokkos_SortImpl.hpp | 5 +++ cmake/KokkosCore_config.h.in | 1 + cmake/kokkos_enable_devices.cmake | 37 ++++++++++--------- cmake/kokkos_enable_options.cmake | 1 + core/src/HIP/Kokkos_HIP.cpp | 4 ++ .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 4 ++ .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 4 ++ core/src/HIP/Kokkos_HIP_Space.hpp | 19 +++++++++- core/unit_test/hip/TestHIP_Spaces.cpp | 16 ++++++++ 10 files changed, 89 insertions(+), 34 deletions(-) diff --git a/Makefile.kokkos b/Makefile.kokkos index 73decf8b47a..f6028cb1418 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1253,6 +1253,22 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call 
kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") + ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1272,26 +1288,10 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call 
kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) diff --git a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 4c174b5fda9..08946228919 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -399,9 +399,14 @@ sort_device_view_with_comparator( using ViewType = Kokkos::View; using MemSpace = typename ViewType::memory_space; +// Note with HIP unified memory this code path is still the right thing to do +// if we end up here when RocThrust is not enabled. +// The create_mirror_view_and_copy will do the right thing (no copy). 
+#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); +#endif copy_to_host_run_stdsort_copy_back(exec, view, comparator); } diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 2df0f6c5205..3ab39cd6abf 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -39,6 +39,7 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK diff --git a/cmake/kokkos_enable_devices.cmake b/cmake/kokkos_enable_devices.cmake index 9a977520a3a..c7d189285c5 100644 --- a/cmake/kokkos_enable_devices.cmake +++ b/cmake/kokkos_enable_devices.cmake @@ -40,6 +40,26 @@ ELSE() ENDIF() KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + +# We want this to default to OFF for cache reasons, but if no +# host space is given, then activate serial +IF (KOKKOS_HAS_TRILINOS) + #However, Trilinos always wants Serial ON + SET(SERIAL_DEFAULT ON) +ELSEIF (KOKKOS_HAS_HOST) + SET(SERIAL_DEFAULT OFF) +ELSE() + SET(SERIAL_DEFAULT ON) + IF (NOT DEFINED Kokkos_ENABLE_SERIAL) + MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") + ENDIF() +ENDIF() +KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") + +# Device backends have to come after host backends for header include order reasons +# Without this we can't make e.g. CudaSpace accessible by HostSpace KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") IF (KOKKOS_ENABLE_OPENACC) COMPILER_SPECIFIC_FLAGS( @@ -90,23 +110,6 @@ IF (KOKKOS_ENABLE_CUDA) LIST(APPEND DEVICE_SETUP_LIST Cuda) ENDIF() -# We want this to default to OFF for cache reasons, but if no -# host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") - -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") - KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index a437f6132aa..32788e7aa0f 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -70,6 +70,7 @@ KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tu KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") +KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") # This option will go away eventually, but allows fallback to old implementation when needed. 
KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index 309e07fb3fb..aced2083ffb 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -146,6 +146,10 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif +#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY + os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; + os << "yes\n"; +#endif os << "\nRuntime Configuration:\n"; diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ab24004f5fc..83f829fddae 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,8 +23,12 @@ #include #include +#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HIPSpace); +#endif KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPHostPinnedSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index fbae5188344..1ca7bd5cd0e 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,8 +20,12 @@ #include #include +#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); +#else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::HIPSpace); +#endif KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace); KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace); diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp 
b/core/src/HIP/Kokkos_HIP_Space.hpp index 7f2004e5cbc..e1b4768b877 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,18 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ +#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } +#else // FIXME_HIP Use execution space instance void* allocate(const HIP&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -74,6 +86,7 @@ class HIPSpace { const size_t arg_logical_size = 0) const { return allocate(arg_label, arg_alloc_size, arg_logical_size); } +#endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -267,7 +280,11 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; - enum : bool { accessible = false }; +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) + enum : bool{accessible = false}; +#else + enum : bool { accessible = true }; +#endif enum : bool { deepcopy = true }; }; diff --git a/core/unit_test/hip/TestHIP_Spaces.cpp b/core/unit_test/hip/TestHIP_Spaces.cpp index 8f7499c244b..673c0f0fff3 100644 --- a/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/core/unit_test/hip/TestHIP_Spaces.cpp @@ -38,8 +38,13 @@ TEST(hip, space_access) { static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable); +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible); +#else + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); +#endif 
static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible); +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) static_assert(!Kokkos::SpaceAccessibility::accessible); +#else + static_assert(Kokkos::SpaceAccessibility::accessible); +#endif static_assert( Kokkos::SpaceAccessibility::accessible); +#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) static_assert(std::is_same::Space, Kokkos::HostSpace>::value); +#else + static_assert(std::is_same::Space, + Kokkos::Device>::value); +#endif static_assert( std::is_same::Space, From 6ea7be76ef65b8477bce566e715afb0453e61a6c Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Wed, 10 Apr 2024 23:19:08 +0200 Subject: [PATCH 364/432] cuda: reduction with `RangePolicy`: fix grid dimensions to work for large values and avoid overflow (#6578) Fixes issue #6578 and adds a test based on the bug report --- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 5 ++-- core/unit_test/TestReduce.hpp | 26 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 0f052be3c30..334834938a1 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -312,8 +312,9 @@ class ParallelReduce, // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); // Required grid.x <= block.y - dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1, - 1); + dim3 grid(std::min(index_type(block.y), + index_type((nwork + block.y - 1) / block.y)), + 1, 1); // TODO @graph We need to effectively insert this in to the graph const int shmem = diff --git a/core/unit_test/TestReduce.hpp b/core/unit_test/TestReduce.hpp index e1aa851f102..61b2bfb1505 100644 --- a/core/unit_test/TestReduce.hpp +++ b/core/unit_test/TestReduce.hpp @@ -625,4 +625,30 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) { } #endif #endif + +#if defined(NDEBUG) +// the following test was made for: +// 
https://github.com/kokkos/kokkos/issues/6517 + +struct FunctorReductionWithLargeIterationCount { + KOKKOS_FUNCTION void operator()(const int64_t /*i*/, double& update) const { + update += 1.0; + } +}; + +TEST(TEST_CATEGORY, reduction_with_large_iteration_count) { + if constexpr (std::is_same_v) { + GTEST_SKIP() << "Disabling for host backends"; + } + + const int64_t N = pow(2LL, 39LL) - pow(2LL, 8LL) + 1; + Kokkos::RangePolicy> p(0, N); + double nu = 0; + EXPECT_NO_THROW(Kokkos::parallel_reduce( + "sample reduction", p, FunctorReductionWithLargeIterationCount(), nu)); + ASSERT_DOUBLE_EQ(nu, double(N)); +} +#endif + } // namespace Test From 0099c10bebf3448d309a0a623a303307d94f6e02 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Thu, 11 Apr 2024 10:00:33 -0400 Subject: [PATCH 365/432] Fix nightly CI --- .jenkins_nightly | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins_nightly b/.jenkins_nightly index b213ba3061a..a8facd365c2 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -134,6 +134,7 @@ pipeline { always { sh 'ccache --show-stats' } + } } } } From a2af4e0d4137fab559a6ea1698af0050a0cbdf03 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 11 Apr 2024 17:41:27 -0400 Subject: [PATCH 366/432] Deprecate trailing Proxy template argument in Kokkos::Array --- core/src/Kokkos_Array.hpp | 17 ++++++++++++++--- core/unit_test/TestArrayOps.hpp | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index ba1626bb72e..461b98f6a72 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -80,7 +80,11 @@ struct ArrayBoundsCheck { /**\brief Derived from the C++17 'std::array'. * Dropping the iterator interface. 
*/ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template +#else +template +#endif struct Array { public: /** @@ -131,8 +135,13 @@ struct Array { } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template struct Array { +#else +template +struct Array { +#endif public: using reference = T&; using const_reference = std::add_const_t&; @@ -178,14 +187,15 @@ struct Array { // Array & operator = ( Array && ) = default ; }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template <> -struct Array { +struct KOKKOS_DEPRECATED Array { struct contiguous {}; struct strided {}; }; template -struct Array::contiguous> { +struct KOKKOS_DEPRECATED Array::contiguous> { private: T* m_elem; size_t m_size; @@ -253,7 +263,7 @@ struct Array::contiguous> { }; template -struct Array::strided> { +struct KOKKOS_DEPRECATED Array::strided> { private: T* m_elem; size_t m_size; @@ -320,6 +330,7 @@ struct Array::strided> { size_type arg_stride) : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +#endif template Array(T, Us...)->Array; diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 06528572714..387589fbe88 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -111,6 +111,7 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { ASSERT_EQ(ce.data(), nullptr); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -389,5 +390,6 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +#endif } // namespace From 92e02b50c428c8f62af56b9446f00e9cede758de Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 11 Apr 2024 18:03:11 -0700 Subject: [PATCH 367/432] CUDA: Update nvcc_wrapper --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index 9b935835d5f..9460ec60a84 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ 
-233,7 +233,7 @@ do cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler) + --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler|--disable-warnings) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument From d88e2a5b0e7948c5315a605f5bc1ead00fc39fc6 Mon Sep 17 00:00:00 2001 From: Geoffroy Lesur Date: Fri, 12 Apr 2024 20:56:29 +0200 Subject: [PATCH 368/432] bring back --fmad option to nvcc_wrapper (#6931) * bring back --fmad option to nvcc_wrapper * Preserve support for flag with single leading dash --------- Co-authored-by: Damien L-G --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index 9460ec60a84..d58645f98ad 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|--fmad=*|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args From f2d37801dc88ac476cbc47054e726c6c433de2cc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 15 Apr 2024 17:26:15 -0400 Subject: [PATCH 369/432] Remove unnecessary 
header include Reported in https://github.com/kokkos/kokkos/pull/6934#pullrequestreview-2001860702 Co-authored-by: Nevin Liber --- core/src/Kokkos_Array.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 461b98f6a72..3d71d09fde1 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include namespace Kokkos { From a8115e5df7d2fd68c215758abaf73bbe8598e0d2 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Tue, 16 Apr 2024 03:42:26 +0200 Subject: [PATCH 370/432] Adding converting constructor in Kokkos::RandomAccessIterator (#6929) * Adding converting constructor in Kokkos::RandomAccessIterator * fix constructible tests for Kokkos::RandomAccessIterator * fix converting constructor in Kokkos::RandomAccessIterator * Add comments to explain friend class of RandomAccessIterator is needed for converting constructor * Introduce KOKKOS_IMPL_CONDITIONAL_EXPLICIT macro from #6830 * Adding a conditional explicit in converting constructor of RandomAccessIterator * Rename ViewType to OtherViewType in converting constructor for readability * Replace tests with static_assert if they rely on compile time behaviour only * fix a condition for conditional explicit * Revert "Introduce KOKKOS_IMPL_CONDITIONAL_EXPLICIT macro from #6830" This reverts commit ee42c6d62e9b8373bd3494c79c97a8845593b325. * On second thought `KOKKOS_IMPL_CONDITIONAL_EXPLICIT` is not such a good idea because it lets users write code that would compile with C++17 but not with later standards. 
--------- Co-authored-by: Yuuichi Asahi --- .../impl/Kokkos_RandomAccessIterator.hpp | 28 ++++++++++++++ .../unit_tests/TestRandomAccessIterator.cpp | 38 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5c9854b87d7..7bcc16a9b55 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -59,6 +59,30 @@ class RandomAccessIterator< ::Kokkos::View > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} +#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond + template + requires(std::is_constructible_v) KOKKOS_FUNCTION + explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#else + template < + class OtherViewType, + std::enable_if_t && + !std::is_convertible_v, + int> = 0> + KOKKOS_FUNCTION explicit RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} + + template , + int> = 0> + KOKKOS_FUNCTION RandomAccessIterator( + const RandomAccessIterator& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#endif + KOKKOS_FUNCTION iterator_type& operator++() { ++m_current_index; @@ -155,6 +179,10 @@ class RandomAccessIterator< ::Kokkos::View > { private: view_type m_view; ptrdiff_t m_current_index = 0; + + // Needed for the converting constructor accepting another iterator + template + friend class RandomAccessIterator; }; } // namespace Impl diff --git a/algorithms/unit_tests/TestRandomAccessIterator.cpp b/algorithms/unit_tests/TestRandomAccessIterator.cpp index 282d85548c5..7d484136b6d 100644 --- a/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ 
-46,6 +46,44 @@ TEST_F(random_access_iterator_test, constructor) { EXPECT_TRUE(true); } +TEST_F(random_access_iterator_test, constructiblity) { + auto first_d = KE::begin(m_dynamic_view); + auto cfirst_d = KE::cbegin(m_dynamic_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_d) tmp_cfirst_d(first_d); + + auto first_s = KE::begin(m_static_view); + auto cfirst_s = KE::cbegin(m_static_view); + + static_assert(std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_s) tmp_cfirst_s(first_s); + + auto first_st = KE::begin(m_strided_view); + auto cfirst_st = KE::cbegin(m_strided_view); + + static_assert( + std::is_constructible_v); + static_assert( + !std::is_constructible_v); + [[maybe_unused]] decltype(cfirst_st) tmp_cfirst_st(first_st); + + // [FIXME] Better to have tests for the explicit specifier with an expression. + // As soon as View converting constructors are re-implemented with a + // conditional explicit, we may add those tests. + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + EXPECT_TRUE(true); +} + template void test_random_access_it_verify(IteratorType it, ValueType gold_value) { using view_t = Kokkos::View; From f94e8d34de523813f5e23e5622615566c80de8fc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Apr 2024 14:46:49 -0400 Subject: [PATCH 371/432] Prefer standard C++ feature testing to guard the C++20 requires expression Temporary fix for our nightly builds so we can make decision on minimum CXX20 compiler requirements when we see fit. 
--- .../impl/Kokkos_RandomAccessIterator.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 7bcc16a9b55..ba0cdc91eea 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -59,7 +59,15 @@ class RandomAccessIterator< ::Kokkos::View > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} -#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond +// FIXME The C++20 requires expression is not supported with Clang 9 and GCC 9 +// The following guards is unsufficient until we increase our minimum CXX20 +// compiler requirements. +// #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond +// We replace the Kokkos guards with standard C++ feature testing in the +// meantime. +#if (defined(__cpp_concepts) && (__cpp_concepts >= 201907L)) && \ + (defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L)) template requires(std::is_constructible_v) KOKKOS_FUNCTION explicit(!std::is_convertible_v) From c9e21ce2ab8e03710494ed53a12c255df56fd3b1 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 16 Apr 2024 12:21:57 -0400 Subject: [PATCH 372/432] Add `kokkos_swap(Array)` specialization Specializing the swap algorithm for Kokkos arrays was initially proposed in #6697 but we dropped it to focus on the Kokkos swap ADL ordeal. Somehow we overlooked a stray header include in the Kokkos::Array header file. This PR reintroduces a `Kokkos::kokkos_swap(Kokkos::Array)` specialization, following closely what the standard library does for `std::swap(std::array)`. 
--- core/src/Kokkos_Array.hpp | 15 +++++++++++++++ core/unit_test/TestArray.cpp | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 461b98f6a72..7fd81030ecd 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -133,6 +133,17 @@ struct Array { KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { return &m_internal_implementation_private_member_data[0]; } + + private: + template + friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< + Impl::is_swappable::value> + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } + } }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -185,6 +196,10 @@ struct Array { // for default move constructor and move assignment operator. // Array( Array && ) = default ; // Array & operator = ( Array && ) = default ; + + private: + friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( + Array&, Array&) noexcept(Impl::is_nothrow_swappable_v) {} }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e138a64d6db..e691d83ebe2 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -120,4 +120,37 @@ static_assert(test_array_aggregate_initialization()); } } +// User-defined type providing a sepcialization of kokkos_swap +struct MyInt { + int i; + + private: + friend constexpr void kokkos_swap(MyInt& lhs, MyInt& rhs) noexcept { + lhs.i = 255; + rhs.i = 127; + } +}; + +constexpr bool test_array_specialization_kokkos_swap() { + Kokkos::Array a{MyInt{1}, MyInt{2}}; + Kokkos::Array b{MyInt{11}, MyInt{22}}; + + // sanity check + if (a[0].i != 1 || a[1].i != 2 || b[0].i != 11 || b[1].i != 22) { + return false; + } + + using Kokkos::kokkos_swap; + kokkos_swap(a, b); + + // check that the user-definied kokkos_swap(MyInt) overload was called + if (a[0].i 
!= 255 || a[1].i != 255 || b[0].i != 127 || b[1].i != 127) { + return false; + } + + return true; +} + +static_assert(test_array_specialization_kokkos_swap()); + } // namespace From 730d8d828f9d43b0cf3a1010b06c0008a7be128a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 Apr 2024 08:58:22 -0400 Subject: [PATCH 373/432] Deprecate specialization of Kokkos::pair for a single element This specialization is not documented, does not follow the standard library, it is not tested and has no known usage in Trilinos. `Kokkos::pair`, as we generally describe it, was intended as a drop-in replacement for `std::pair`. Hence, obscure departures from the standard implementation do not look like a good idea. This PR suggests deprecating that `T2=void` specialization for degenerate pairs that only hold one element. --- core/src/Kokkos_Pair.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9be8d8d7aa1..d1bd11f7162 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,12 +413,13 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. 
// template -struct pair { +struct KOKKOS_DEPRECATED pair { using first_type = T1; using second_type = void; @@ -483,6 +484,7 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } +#endif namespace Impl { template From d914fe316ba64c6755abfd8e68cd7d1b872e04f7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 17 Apr 2024 10:05:24 -0400 Subject: [PATCH 374/432] Fix deprecated warning from `Kokkos::Array` specialization (#6945) * Fix deprecated warning from Kokkos::Array specialization The warnings come from the template arguments in deprecated specialization `Kokkos::Array<>::{contiguous,strided}` which refer to `Kokkos::Array<>` that is marked as deprecated. Minimal reproducer [here](https://godbolt.org/z/s18Txa5P6). GCC9 eats it but GCC10 onwards raise a warning. I propose the easy way out, that is we drop the `[[deprecated]]` attribute on `Kokkos::Array<>`. Let me know if you have a better idea. Sample warning from ArborX nightlies for completeness: ``` In file included from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/KokkosExp_MDRangePolicy.hpp:29, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Tuners.hpp:28, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/impl/Kokkos_Tools_Generic.hpp:26, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Parallel.hpp:34, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_MemoryPool.hpp:26, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_TaskScheduler.hpp:34, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Serial/Kokkos_Serial.hpp:37, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp:21, from /var/jenkins/workspace/ArborX_nightly/build-kokkos/KokkosCore_Config_DeclareBackend.hpp:22, from /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Core.hpp:45, from 
/var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/impl/Kokkos_Core.cpp:21: /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:197:66: warning: 'Array' is deprecated [-Wdeprecated-declarations] 197 | struct KOKKOS_DEPRECATED Array::contiguous> { | ^~~~~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:191:26: note: declared here 191 | struct KOKKOS_DEPRECATED Array { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:265:66: warning: 'Array' is deprecated [-Wdeprecated-declarations] 265 | struct KOKKOS_DEPRECATED Array::strided> { | ^~~~~~~ /var/jenkins/workspace/ArborX_nightly/source-kokkos/core/src/Kokkos_Array.hpp:191:26: note: declared here 191 | struct KOKKOS_DEPRECATED Array { | | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` * Revert "Fix deprecated warning from Kokkos::Array specialization" This reverts commit 38db1cab74df5fc547e779d6b9e3e65ebcb89a14. * Let Array<>::{contiguous,strided} be aliases to Impl:: tag classes Better approach to suppress the GCC deprecation warning suggested by Thomas on Slack. 
Co-Authored-By: Thomas Padioleau --------- Co-authored-by: Thomas Padioleau --- core/src/Kokkos_Array.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 3d71d09fde1..fed18d73fef 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -187,14 +187,20 @@ struct Array { }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +struct KokkosArrayContiguous {}; +struct KokkosArrayStrided {}; +} // namespace Impl + template <> struct KOKKOS_DEPRECATED Array { - struct contiguous {}; - struct strided {}; + using contiguous = Impl::KokkosArrayContiguous; + using strided = Impl::KokkosArrayStrided; }; template -struct KOKKOS_DEPRECATED Array::contiguous> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; @@ -262,7 +268,8 @@ struct KOKKOS_DEPRECATED Array::contiguous> { }; template -struct KOKKOS_DEPRECATED Array::strided> { +struct KOKKOS_DEPRECATED + Array { private: T* m_elem; size_t m_size; From 69c527a4245f495ae7d03c2bf4fcd7dd4364d0a7 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 17 Apr 2024 13:25:04 -0400 Subject: [PATCH 375/432] [ci skip] Enable deprecated code and deprecated warnings in nightly CI --- .jenkins_nightly | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins_nightly b/.jenkins_nightly index a8facd365c2..b723f12c0fc 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -95,7 +95,8 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. 
&& \ make -j8 && ctest --verbose @@ -123,7 +124,7 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ From e7b486ff614abb6454e1172098888f8de15f7b65 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 17 Apr 2024 18:10:26 -0400 Subject: [PATCH 376/432] Serial: Use the provided execution space instance in TeamPolicy --- core/src/Serial/Kokkos_Serial_Parallel_Team.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index f34a7daaca0..a25b51496ef 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -37,6 +37,8 @@ class TeamPolicyInternal int m_league_size; int m_chunk_size; + Kokkos::Serial m_space; + public: //! Tag this class as a kokkos execution policy using execution_policy = TeamPolicyInternal; @@ -46,10 +48,7 @@ class TeamPolicyInternal //! Execution space of this execution policy: using execution_space = Kokkos::Serial; - const typename traits::execution_space& space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space& space() const { return m_space; } template friend class TeamPolicyInternal; @@ -116,12 +115,13 @@ class TeamPolicyInternal return (level == 0 ? 
1024 * 32 : 20 * 1024 * 1024); } /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space&, int league_size_request, + TeamPolicyInternal(const execution_space& space, int league_size_request, int team_size_request, int /* vector_length_request */ = 1) : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_league_size(league_size_request), - m_chunk_size(32) { + m_chunk_size(32), + m_space(space) { if (team_size_request > 1) Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); } From 0859ab0af9b44315832cd27353bd3acf188853a3 Mon Sep 17 00:00:00 2001 From: "Nevin \":-)\" Liber" Date: Wed, 17 Apr 2024 17:31:45 -0500 Subject: [PATCH 377/432] Fixed the link for P6601 (Threads backend change) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7b8af7695c..c70ee5505f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ * Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) #### Threads: -* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6446) +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) #### OpenMP: * Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) From 34d0db2f41dc11c9d30c3ff3449cddfc366c7e3d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 18 Apr 2024 07:54:59 -0400 Subject: [PATCH 378/432] Add test --- core/unit_test/TestExecSpacePartitioning.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 65314d6be7c..8703cc3b273 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -28,6 +28,17 @@ struct SumFunctor { void operator()(int i, int& 
lsum) const { lsum += i; } }; +template +void check_space_member_for_policies(const ExecSpace& exec) { + Kokkos::RangePolicy range_policy(exec, 0, 1); + ASSERT_EQ(range_policy.space(), exec); + Kokkos::MDRangePolicy> mdrange_policy(exec, {0, 0}, + {1, 1}); + ASSERT_EQ(mdrange_policy.space(), exec); + Kokkos::TeamPolicy team_policy(exec, 1, 1); + ASSERT_EQ(team_policy.space(), exec); +} + template void check_distinctive([[maybe_unused]] ExecSpace exec1, [[maybe_unused]] ExecSpace exec2) { @@ -89,6 +100,9 @@ void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { void test_partitioning(std::vector& instances) { check_distinctive(instances[0], instances[1]); + check_space_member_for_policies(instances[0]); + check_space_member_for_policies(instances[1]); + int sum1, sum2; int N = 3910; run_threaded_test( From 44fde213fb9515adfeb6645fbdbca5a4deeca633 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 18 Apr 2024 10:34:25 -0400 Subject: [PATCH 379/432] Use Kokkos::AUTO for OpenMPTarget --- core/unit_test/TestExecSpacePartitioning.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestExecSpacePartitioning.hpp b/core/unit_test/TestExecSpacePartitioning.hpp index 8703cc3b273..f8b570ab64d 100644 --- a/core/unit_test/TestExecSpacePartitioning.hpp +++ b/core/unit_test/TestExecSpacePartitioning.hpp @@ -35,7 +35,7 @@ void check_space_member_for_policies(const ExecSpace& exec) { Kokkos::MDRangePolicy> mdrange_policy(exec, {0, 0}, {1, 1}); ASSERT_EQ(mdrange_policy.space(), exec); - Kokkos::TeamPolicy team_policy(exec, 1, 1); + Kokkos::TeamPolicy team_policy(exec, 1, Kokkos::AUTO); ASSERT_EQ(team_policy.space(), exec); } From 8706b68d5bcb66473f180e131696e3d520bd34a7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Apr 2024 13:56:19 -0500 Subject: [PATCH 380/432] kokkos_swap(Array) member friend should not be templated on some other type U Co-Authored-By: Maarten Arnst --- core/src/Kokkos_Array.hpp | 1 - 1 file changed, 1 
deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 7fd81030ecd..d1132bdbb4e 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -135,7 +135,6 @@ struct Array { } private: - template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> kokkos_swap(Array& a, From 86f5988b3128cd751da53d0b0c1af87d4ff7324a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 Apr 2024 13:57:54 -0500 Subject: [PATCH 381/432] Fix noexcept specification for kokkos_swap on zero-sized arrays Co-authored-by: Nevin Liber --- core/src/Kokkos_Array.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index d1132bdbb4e..09681c18842 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -198,7 +198,7 @@ struct Array { private: friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( - Array&, Array&) noexcept(Impl::is_nothrow_swappable_v) {} + Array&, Array&) noexcept {} }; #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 From 205fd156d990138dd6b6b400fb44d4aa9b196aa0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 15:07:03 +0000 Subject: [PATCH 382/432] Replace deprecated sycl::device_ptr/sycl::host_ptr --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 ++++-- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 34 ++++++++--------- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 13 ++++--- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 13 ++++--- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 20 +++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 37 ++++++++++--------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 23 ++++++------ core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 4 +- 9 files changed, 84 insertions(+), 73 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 4a1c910c73d..d2112e3e4f2 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,7 +166,7 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::device_ptr SYCLInternal::resize_team_scratch_space( +sycl::ext::intel::device_ptr SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race @@ -251,7 +251,8 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { +sycl::ext::intel::device_ptr SYCLInternal::scratch_space( + const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); @@ -271,7 +272,8 @@ sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { return m_scratchSpace; } -sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { +sycl::ext::intel::host_ptr SYCLInternal::scratch_host( + const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); @@ -291,7 +293,8 @@ sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { return m_scratchHost; } -sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { +sycl::ext::intel::device_ptr SYCLInternal::scratch_flags( + const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index ab7e8ce71e0..0666e1bd626 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp 
@@ -43,13 +43,12 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::device_ptr scratch_space(const std::size_t size); - sycl::device_ptr scratch_flags(const std::size_t size); - sycl::host_ptr scratch_host(const std::size_t size); + sycl::ext::intel::device_ptr scratch_space(const std::size_t size); + sycl::ext::intel::device_ptr scratch_flags(const std::size_t size); + sycl::ext::intel::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, - std::int64_t bytes, - bool force_shrink = false); + sycl::ext::intel::device_ptr resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -59,21 +58,22 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + sycl::ext::intel::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::ext::intel::host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + sycl::ext::intel::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + static constexpr int m_n_team_scratch = 10; + mutable 
int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable sycl::ext::intel::device_ptr + m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index ecb4a863da2..b1d32172d82 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::device_ptr m_global_scratch_ptr; + sycl::ext::intel::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,7 +72,8 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -161,10 +162,10 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. 
- auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = static_cast>( + space.resize_team_scratch_space( m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index f55280e22e3..8fec299c5d5 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl::ext::intel::device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,7 +114,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -155,14 +155,15 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { sycl::local_accessor local_mem( diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 5333e3c8a83..7feb2110068 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl::ext::intel::device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,7 +88,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { @@ -125,13 +125,15 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr results_ptr, int values_per_thread) { + sycl::ext::intel::device_ptr results_ptr, + int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -301,9 +303,9 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( - sizeof(value_type) * value_count * n_wgroups)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git 
a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 27165c59e3a..edb1b54b827 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + sycl::ext::intel::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -94,9 +94,9 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( - sizeof(value_type) * std::max(value_count, 1u))); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -113,7 +113,8 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -156,8 +157,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues @@ -170,12 +172,13 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::ext::intel::device_ptr const global_scratch_ptr = + m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::device_ptr results_ptr) { + sycl::ext::intel::device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -330,9 +333,9 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); - results_ptr = - static_cast>(instance.scratch_space( - sizeof(value_type) * std::max(value_count, 1u) * init_size)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -425,10 +428,10 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = static_cast>( + space.resize_team_scratch_space( m_scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 58cfea6a97a..b773af6cda7 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::host_ptr m_scratch_host = nullptr; + sycl::ext::intel::host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -166,8 +166,9 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = + static_cast>( + instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); @@ -175,8 +176,8 @@ class ParallelScanSYCLBase { auto scan_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::device_ptr global_mem_, - sycl::device_ptr group_results_) { + sycl::ext::intel::device_ptr global_mem_, + sycl::ext::intel::device_ptr group_results_) { auto lambda = [=](sycl::nd_item<1> item) { auto global_mem = global_mem_; 
auto group_results = group_results_; @@ -253,8 +254,8 @@ class ParallelScanSYCLBase { size_t wgroup_size; size_t n_wgroups; - sycl::device_ptr global_mem; - sycl::device_ptr group_results; + sycl::ext::intel::device_ptr global_mem; + sycl::ext::intel::device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -288,10 +289,10 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = - static_cast>(instance.scratch_space( - n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( + global_mem = static_cast>( + instance.scratch_space(n_wgroups * (wgroup_size + 1) * + sizeof(value_type))); + m_scratch_host = static_cast>( instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 2b4c2be5227..715d65a98f2 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::device_ptr scratch_level_1_ptr, + sycl::ext::intel::device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 7069805a5b5..3b818490901 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::device_ptr 
results_ptr, + sycl::ext::intel::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +100,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::device_ptr results_ptr, + ValueType local_value, sycl::ext::intel::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); From 5932685c939a08b1b29b9c56bcb264728c1b16ba Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 17:12:16 +0000 Subject: [PATCH 383/432] Introduce alias based on feature macro --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 +++--- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 24 ++++++------ .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 17 ++++---- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 17 ++++---- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 20 +++++----- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 39 ++++++++++--------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 19 ++++----- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 5 ++- core/src/setup/Kokkos_Setup_SYCL.hpp | 17 ++++++++ 10 files changed, 101 insertions(+), 70 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index d2112e3e4f2..adfd4c10b04 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,8 +166,9 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::ext::intel::device_ptr SYCLInternal::resize_team_scratch_space( - int scratch_pool_id, std::int64_t bytes, bool force_shrink) { +Kokkos::Impl::SYCLTypes::device_ptr 
+SYCLInternal::resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, + bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. @@ -251,7 +252,7 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::ext::intel::device_ptr SYCLInternal::scratch_space( +Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { @@ -272,7 +273,7 @@ sycl::ext::intel::device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -sycl::ext::intel::host_ptr SYCLInternal::scratch_host( +Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { @@ -293,7 +294,7 @@ sycl::ext::intel::host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -sycl::ext::intel::device_ptr SYCLInternal::scratch_flags( +Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 0666e1bd626..de77b8efdeb 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,11 +43,13 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - sycl::ext::intel::device_ptr scratch_space(const std::size_t size); - sycl::ext::intel::device_ptr scratch_flags(const std::size_t size); - sycl::ext::intel::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::SYCLTypes::device_ptr scratch_space( + const std::size_t size); + Kokkos::Impl::SYCLTypes::device_ptr scratch_flags( + const std::size_t size); + 
Kokkos::Impl::SYCLTypes::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::ext::intel::device_ptr resize_team_scratch_space( + Kokkos::Impl::SYCLTypes::device_ptr resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); @@ -58,19 +60,19 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::ext::intel::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - sycl::ext::intel::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::ext::intel::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::SYCLTypes::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::SYCLTypes::host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::SYCLTypes::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space static constexpr int m_n_team_scratch = 10; mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::ext::intel::device_ptr + mutable Kokkos::Impl::SYCLTypes::device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index b1d32172d82..7f258ecccae 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::ext::intel::device_ptr m_global_scratch_ptr; + Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; 
size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. @@ -72,7 +72,7 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::ext::intel::device_ptr const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { @@ -162,12 +162,13 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = static_cast>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = + static_cast>( + space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 8fec299c5d5..155f4b00821 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + Kokkos::Impl::SYCLTypes::device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,8 +114,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -155,14 +156,16 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); auto scratch_flags = - static_cast>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 7feb2110068..c00f9bb6232 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + Kokkos::Impl::SYCLTypes::device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,8 +88,9 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); @@ -126,13 +127,13 @@ class Kokkos::Impl::ParallelReduce>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::ext::intel::device_ptr results_ptr, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); @@ -303,9 +304,10 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * + n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index edb1b54b827..b9be1148832 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -94,9 +94,10 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u))); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -113,7 +114,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -158,7 +159,7 @@ class Kokkos::Impl::ParallelReduce>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -172,13 +173,13 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = + Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - sycl::ext::intel::device_ptr results_ptr) { + Kokkos::Impl::SYCLTypes::device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -333,9 +334,10 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); - results_ptr = static_cast>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u) * init_size)); + results_ptr = + static_cast>( + instance.scratch_space(sizeof(value_type) * + std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -428,12 +430,13 @@ class Kokkos::Impl::ParallelReduce>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + auto& space = *m_policy.space().impl_internal_space_instance(); + m_scratch_pool_id = space.acquire_team_scratch_space(); + m_global_scratch_ptr = + static_cast>( + space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index b773af6cda7..a3efe56b99c 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl::ext::intel::host_ptr m_scratch_host = nullptr; + Kokkos::Impl::SYCLTypes::host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -167,7 +167,7 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); auto scratch_flags = - static_cast>( + static_cast>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); @@ -176,8 +176,8 @@ class ParallelScanSYCLBase { auto scan_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl::ext::intel::device_ptr global_mem_, - sycl::ext::intel::device_ptr group_results_) { + Kokkos::Impl::SYCLTypes::device_ptr 
global_mem_, + Kokkos::Impl::SYCLTypes::device_ptr group_results_) { auto lambda = [=](sycl::nd_item<1> item) { auto global_mem = global_mem_; auto group_results = group_results_; @@ -254,8 +254,8 @@ class ParallelScanSYCLBase { size_t wgroup_size; size_t n_wgroups; - sycl::ext::intel::device_ptr global_mem; - sycl::ext::intel::device_ptr group_results; + Kokkos::Impl::SYCLTypes::device_ptr global_mem; + Kokkos::Impl::SYCLTypes::device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,11 +289,12 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = static_cast>( + global_mem = static_cast>( instance.scratch_space(n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( - instance.scratch_host(sizeof(value_type))); + m_scratch_host = + static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 715d65a98f2..d838dc94c67 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::ext::intel::device_ptr scratch_level_1_ptr, + Kokkos::Impl::SYCLTypes::device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 3b818490901..f25dec76777 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr 
bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl::ext::intel::device_ptr results_ptr, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,7 +100,8 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl::ext::intel::device_ptr results_ptr, + ValueType local_value, + Kokkos::Impl::SYCLTypes::device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 30f6fa2ad23..7fb10bb39a2 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -45,4 +45,21 @@ #define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif +// FIXME_SYCL Use type directly once it has stabilized in SYCL. +namespace Kokkos::Impl::SYCLTypes { +#ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES +#error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
+#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 +template +using device_ptr = sycl::ext::intel::device_ptr; +template +using host_ptr = sycl::ext::intel::host_ptr; +#else +template +using device_ptr = sycl::device_ptr; +template +using host_ptr = sycl::host_ptr; +#endif +} // namespace Kokkos::Impl::SYCLTypes + #endif From a7827731cf8256a9387a8786555aa5f97dccd17a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 22 Apr 2024 19:07:13 -0400 Subject: [PATCH 384/432] Kokkos::Impl::SYCLTypes:: -> Kokkos::Impl::sycl_ --- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 11 +- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 24 ++- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 12 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 20 +-- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 22 +-- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 34 ++-- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 166 +++++++++--------- core/src/SYCL/Kokkos_SYCL_Team.hpp | 2 +- .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 5 +- core/src/setup/Kokkos_Setup_SYCL.hpp | 12 +- 10 files changed, 142 insertions(+), 166 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index adfd4c10b04..5843dca8123 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -166,9 +166,8 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -Kokkos::Impl::SYCLTypes::device_ptr -SYCLInternal::resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, - bool force_shrink) { +Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( + int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
@@ -252,7 +251,7 @@ void SYCLInternal::finalize() { m_queue.reset(); } -Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { @@ -273,7 +272,7 @@ Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( +Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { @@ -294,7 +293,7 @@ Kokkos::Impl::SYCLTypes::host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -Kokkos::Impl::SYCLTypes::device_ptr SYCLInternal::scratch_flags( +Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index de77b8efdeb..2d784ef8a5f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -43,13 +43,11 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - Kokkos::Impl::SYCLTypes::device_ptr scratch_space( - const std::size_t size); - Kokkos::Impl::SYCLTypes::device_ptr scratch_flags( - const std::size_t size); - Kokkos::Impl::SYCLTypes::host_ptr scratch_host(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); + Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - Kokkos::Impl::SYCLTypes::device_ptr resize_team_scratch_space( + Kokkos::Impl::sycl_device_ptr resize_team_scratch_space( int scratch_pool_id, std::int64_t 
bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); @@ -60,19 +58,19 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - Kokkos::Impl::SYCLTypes::device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - Kokkos::Impl::SYCLTypes::host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - Kokkos::Impl::SYCLTypes::device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::sycl_host_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::sycl_device_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space static constexpr int m_n_team_scratch = 10; mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable Kokkos::Impl::SYCLTypes::device_ptr + mutable Kokkos::Impl::sycl_device_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; mutable int m_current_team_scratch = 0; mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 7f258ecccae..d98f4837315 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -44,7 +44,7 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - Kokkos::Impl::SYCLTypes::device_ptr m_global_scratch_ptr; + sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; // Only let one ParallelFor instance at a time use the team scratch memory. // The constructor acquires the mutex which is released in the destructor. 
@@ -72,8 +72,7 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - Kokkos::Impl::SYCLTypes::device_ptr const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -165,10 +164,9 @@ class Kokkos::Impl::ParallelFor, auto& space = *m_policy.space().impl_internal_space_instance(); m_scratch_pool_id = space.acquire_team_scratch_space(); m_global_scratch_ptr = - static_cast>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 155f4b00821..c1414ee0581 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -94,10 +94,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl_device_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -114,9 +114,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -156,17 +155,14 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = - static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { sycl::local_accessor local_mem( diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index c00f9bb6232..dbe2366b8bc 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -69,10 +69,10 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; + sycl_device_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -88,9 +88,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count)); + results_ptr = static_cast>( + instance.scratch_space(sizeof(value_type) * value_count)); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { const auto begin = policy.begin(); @@ -126,15 +125,13 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, - int values_per_thread) { + sycl_device_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -305,9 +302,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * value_count * - n_wgroups)); + static_cast>(instance.scratch_space( + sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b9be1148832..1332fafde94 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; + sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; @@ -82,7 +82,7 @@ class Kokkos::Impl::ParallelReduce>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -95,9 +95,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u))); + static_cast>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -114,8 +113,7 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -158,9 +156,8 @@ class Kokkos::Impl::ParallelReduce>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues @@ -173,13 +170,12 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = - m_global_scratch_ptr; + sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr) { + sycl_device_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) @@ -335,9 +331,8 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast>( - instance.scratch_space(sizeof(value_type) * - std::max(value_count, 1u) * init_size)); + static_cast>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = 2 * @@ -433,10 +428,9 @@ class Kokkos::Impl::ParallelReduce>( - space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); + static_cast>(space.resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); if (static_cast(space.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index a3efe56b99c..bfc3fba7412 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -146,7 +146,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - Kokkos::Impl::SYCLTypes::host_ptr m_scratch_host = nullptr; + sycl_host_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -166,96 +166,93 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = - static_cast>( - instance.scratch_flags(sizeof(unsigned int))); + auto scratch_flags = static_cast>( + instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = - [&](sycl::local_accessor local_mem, - sycl::local_accessor num_teams_done, - Kokkos::Impl::SYCLTypes::device_ptr global_mem_, - Kokkos::Impl::SYCLTypes::device_ptr group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const 
CombinedFunctorReducer< - FunctorType, typename Analysis::Reducer>& functor_reducer = - functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void::value) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = [&](sycl::local_accessor local_mem, + sycl::local_accessor + num_teams_done, + sycl_device_ptr global_mem_, + sycl_device_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; + + const CombinedFunctorReducer& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); + + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); + + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } - workgroup_scan<>(item, reducer, local_mem, local_value, - wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - // Write results to global memory - if (global_id < size) global_mem[global_id] = 
local_value; + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; - if (local_id == wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref - scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - item.barrier(sycl::access::fence_space::global_space); - if (num_teams_done[0] == n_wgroups) { - if (local_id == 0) *scratch_flags = 0; - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); - if (id < static_cast(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; - } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - item.barrier(sycl::access::fence_space::global_space); - } + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, 
&total); + group_results[id] = local_value; } - }; - return lambda; - }; + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - Kokkos::Impl::SYCLTypes::device_ptr global_mem; - Kokkos::Impl::SYCLTypes::device_ptr group_results; + sycl_device_ptr global_mem; + sycl_device_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -289,12 +286,11 @@ class ParallelScanSYCLBase { // that will contain the sum of the previous workgroups totals. // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass - global_mem = static_cast>( - instance.scratch_space(n_wgroups * (wgroup_size + 1) * - sizeof(value_type))); - m_scratch_host = - static_cast>( - instance.scratch_host(sizeof(value_type))); + global_mem = + static_cast>(instance.scratch_space( + n_wgroups * (wgroup_size + 1) * sizeof(value_type))); + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index d838dc94c67..910e3602714 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -339,7 +339,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - Kokkos::Impl::SYCLTypes::device_ptr scratch_level_1_ptr, + sycl_device_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index f25dec76777..06be143ecca 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -28,7 +28,7 @@ inline constexpr bool use_shuffle_based_algorithm = template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, + sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -100,8 +100,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, - Kokkos::Impl::SYCLTypes::device_ptr results_ptr, + ValueType local_value, sycl_device_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7fb10bb39a2..b117d75acb9 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -46,20 +46,20 @@ #endif // FIXME_SYCL Use type directly once it has stabilized in SYCL. -namespace Kokkos::Impl::SYCLTypes { +namespace Kokkos::Impl { #ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES #error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 template -using device_ptr = sycl::ext::intel::device_ptr; +using sycl_device_ptr = sycl::ext::intel::device_ptr; template -using host_ptr = sycl::ext::intel::host_ptr; +using sycl_host_ptr = sycl::ext::intel::host_ptr; #else template -using device_ptr = sycl::device_ptr; +using sycl_device_ptr = sycl::device_ptr; template -using host_ptr = sycl::host_ptr; +using sycl_host_ptr = sycl::host_ptr; #endif -} // namespace Kokkos::Impl::SYCLTypes +} // namespace Kokkos::Impl #endif From ab3cae4865aec2114a7fe21288fdab2916b92188 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 10:14:56 +0200 Subject: [PATCH 385/432] Fix wrong macro guards for deprecated Kokkos::pair specialization Co-Authored-By: Nicolas Morales --- core/src/Kokkos_Pair.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index d1bd11f7162..9c3516eb222 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -413,7 +413,7 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } -#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. 
From fafe861d0683cdde279a44dc8dc10b71d9866c30 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 14:55:20 +0200 Subject: [PATCH 386/432] Fix support for Kokkos::Array of const-qualified element type --- core/src/Kokkos_Array.hpp | 3 ++- core/unit_test/TestArray.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index dcba8a42484..29e5edd9baa 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -134,8 +134,9 @@ struct Array { } private: + template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - Impl::is_swappable::value> + Impl::is_swappable::value> kokkos_swap(Array& a, Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index e691d83ebe2..d20d355b792 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -120,6 +120,13 @@ static_assert(test_array_aggregate_initialization()); } } +constexpr bool test_array_const_qualified_element_type() { + Kokkos::Array a{255}; + return a[0] == 255; +} + +static_assert(test_array_const_qualified_element_type()); + // User-defined type providing a sepcialization of kokkos_swap struct MyInt { int i; From 63eef4623a84634f97b2761c354792e5c0613cd9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 24 Apr 2024 23:35:59 +0200 Subject: [PATCH 387/432] Try to fix the CUDA 11.0 build --- core/src/Kokkos_Array.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 29e5edd9baa..6ff27db061b 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -137,8 +137,13 @@ struct Array { template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> - kokkos_swap(Array& a, - Array& b) noexcept(Impl::is_nothrow_swappable_v) { + kokkos_swap( +#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_4 + Array& a, Array& b +#else + Array& a, Array& b +#endif + ) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } From ebb1cb308a956a4b98f2a5eb26660ed8ca3fe6ad Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 07:43:16 +0200 Subject: [PATCH 388/432] Revert "Try to fix the CUDA 11.0 build" This reverts commit 63eef4623a84634f97b2761c354792e5c0613cd9. --- core/src/Kokkos_Array.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 6ff27db061b..29e5edd9baa 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -137,13 +137,8 @@ struct Array { template friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> - kokkos_swap( -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Array& a, Array& b -#else - Array& a, Array& b -#endif - ) noexcept(Impl::is_nothrow_swappable_v) { + kokkos_swap(Array& a, + Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } From 031f6d94a4294c767c4e049f9aa5fadee47c9ef3 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 08:11:46 +0200 Subject: [PATCH 389/432] Alternate definition of Impl::is_nothrow_swappable_v for NVCC version less than 11.4 --- core/src/Kokkos_Swap.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index 2f849a13ab6..fd69a8e6266 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -37,6 +37,26 @@ kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& namespace Impl { +// Workaround for the definition of is_nothrow_swappable_v +#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int) 
noexcept(noexcept(kokkos_swap(std::declval(), + std::declval()))); + struct Nope {}; // test_swap must return a complete type for the definition + // of nothrow below + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; + static constexpr bool nothrow = noexcept(test_swap(0)); +}; + +template +inline constexpr bool is_nothrow_swappable_v = is_swappable::nothrow; +#else template struct is_swappable { template @@ -52,6 +72,7 @@ struct is_swappable { template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); +#endif } // namespace Impl From 2391f1765318725042dcdad6581eca6c03cb5adc Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 17:23:11 +0200 Subject: [PATCH 390/432] Avoid introducing a 2nd definition of the Impl::swappable trait Co-Authored-By: Daniel Arndt --- core/src/Kokkos_Swap.hpp | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index fd69a8e6266..907f8607a7e 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -37,26 +37,6 @@ kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& namespace Impl { -// Workaround for the definition of is_nothrow_swappable_v -#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) -template -struct is_swappable { - template - static decltype(kokkos_swap(std::declval(), std::declval())) - test_swap(int) noexcept(noexcept(kokkos_swap(std::declval(), - std::declval()))); - struct Nope {}; // test_swap must return a complete type for the definition - // of nothrow below - template - static Nope test_swap(long); - static constexpr bool value = - !std::is_same_v(0)), Nope>; - static constexpr bool nothrow = noexcept(test_swap(0)); -}; - -template -inline constexpr bool is_nothrow_swappable_v = is_swappable::nothrow; -#else template struct is_swappable { template @@ -69,6 
+49,13 @@ struct is_swappable { !std::is_same_v(0)), Nope>; }; +#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) +template +inline constexpr bool is_nothrow_swappable_v = + is_swappable::value&& noexcept( + kokkos_swap(std::declval&>(), + std::declval&>())); +#else template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); From d434f87e91069bc0d0af020053f4ca7c3f3b80c4 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 25 Apr 2024 20:51:34 +0200 Subject: [PATCH 391/432] Do not require OpenMP support for languages other than CXX Specify CXX component when searching for OpenMP so that OpenMP support is not required for other languages with CMake. One caveat is that finding the OpenMP dependency downstream will require CMake minimum version of 3.10 https://cmake.org/cmake/help/latest/module/FindOpenMP.html Co-Authored-By: Luca Bertagna --- cmake/kokkos_tpls.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index 6ef3b79bde2..df01f200d13 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -103,13 +103,13 @@ if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) endif() IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) + find_package(OpenMP REQUIRED COMPONENTS CXX) # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency # so we just append the flags here instead of linking with the OpenMP target. 
IF(KOKKOS_HAS_TRILINOS) COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED) + KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) ENDIF() ENDIF() From 19ca9ce97a80bbf9f43353b22c09c437f1389384 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 26 Apr 2024 09:47:04 -0600 Subject: [PATCH 392/432] Update version --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f4252437111..d0cf6696f89 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.2.01](https://github.com/kokkos/kokkos/releases/tag/4.2.01). +The current release is [4.3.00](https://github.com/kokkos/kokkos/releases/tag/4.3.00). ```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.2.01.tar.gz +wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: From 9686392118b72205251a0c9511c5fceacc1b6db8 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Fri, 26 Apr 2024 09:52:03 -0600 Subject: [PATCH 393/432] Add Linux Foundation notice and fix C++ standard --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d0cf6696f89..7d9d70fac5c 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ backends in development. **Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).** +Kokkos is a [Linux Foundation](https://linuxfoundation.org) project. 
+ ## Learning about Kokkos To start learning about Kokkos: @@ -44,7 +46,7 @@ git clone -b develop https://github.com/kokkos/kokkos.git ### Building Kokkos -To build Kokkos, you will need to have a C++ compiler that supports C++14 or later. +To build Kokkos, you will need to have a C++ compiler that supports C++17 or later. All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html). Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html). From 7e7709fdb8029e6e97f5a5f8549c3a42a4ddbdbd Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 27 Apr 2024 12:32:21 -0400 Subject: [PATCH 394/432] SYCL: Avoid deprecated floating-point number abs overloads (#6959) * Avoid deprecated floating-point number abs overloads * Add a comment --- core/src/Kokkos_MathematicalFunctions.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index 3fead8dd293..19967782e5e 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -277,12 +277,20 @@ KOKKOS_INLINE_FUNCTION long long abs(long long n) { #endif } KOKKOS_INLINE_FUNCTION float abs(float x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } KOKKOS_INLINE_FUNCTION double abs(double x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } inline long double abs(long double x) { using std::abs; From 4ec82963fbcd174aab79a86875c85e09a4fcc170 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Sun, 28 Apr 2024 08:52:18 -0700 Subject: [PATCH 395/432] OpenMPTarget: Update loop order in MDRange 
(#6925) * OpenMPTarget: Reverse loop order in MDRange. * OpenMPTarget: Honor user request for iteration in MDRange. * OpenMPTarget: clang-format * OpenMPTarget: Remove unecessary includes. --------- Co-authored-by: Rahulkumar Gayatri --- core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 3 +- .../Kokkos_OpenMPTarget_MDRangePolicy.hpp | 5 + ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 383 +++++++++++ ...s_OpenMPTarget_ParallelReduce_MDRange.hpp} | 633 ++++++++---------- 4 files changed, 674 insertions(+), 350 deletions(-) create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp rename core/src/OpenMPTarget/{Kokkos_OpenMPTarget_Parallel_MDRange.hpp => Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp} (62%) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index ea4e7f6baba..84c7b85f11d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -146,7 +146,8 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ #include -#include +#include +#include #include /*--------------------------------------------------------------------------*/ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp index d718f56d38b..e353676b617 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp @@ -22,6 +22,10 @@ namespace Kokkos { namespace Impl { +using OpenMPTargetIterateLeft = std::integral_constant; +using OpenMPTargetIterateRight = + std::integral_constant; + template struct ThreadAndVectorNestLevel +#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + FunctorType functor(m_functor); + Policy policy = m_policy; + + typename Policy::point_type unused; + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + + execute_tile( + unused, functor, policy, + std::integral_constant()); + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const 
Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const 
Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateRight) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 
= policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i1 = begin_1; i1 < end_1; ++i1) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if 
constexpr (std::is_void::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template + inline std::enable_if_t execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy, OpenMPTargetIterateLeft) const { + (void)offset; + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : 
functor) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp similarity index 62% rename from core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp rename to core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 6878531730d..0782a79302a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -14,128 +14,122 @@ // //@HEADER -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP #include #include -#include +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" #include -// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, -// this was tracked down to a bug in clang with regards of mapping structs -// with arrays of long in it. 
Arrays of int might be fine though ... -#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { +template +class ParallelReduce, + Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; + using Policy = Kokkos::MDRangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using Index = typename Policy::index_type; - const FunctorType m_functor; + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + static constexpr bool UseReducer = + !std::is_same_v; + + const pointer_type m_result_ptr; + const CombinedFunctorReducerType m_functor_reducer; const Policy m_policy; + using ParReduceCopy = ParallelReduceCopy; + + bool m_result_ptr_on_device; + + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_memory_lock; + public: inline void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); - Policy policy = m_policy; - -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - typename Policy::point_type unused; - - execute_tile(unused, functor, policy); -#else - const int64_t begin = 0; - const int64_t end = m_policy.m_num_tiles; - -#pragma omp target teams distribute map(to : functor) num_teams(end - begin) - { - for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) { - -#pragma omp parallel - { - typename Policy::point_type offset; - if (Policy::outer_direction == Policy::Left) { - for (int i = 0; i < Policy::rank; ++i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } else { - for (int i = Policy::rank - 1; i >= 0; --i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } - execute_tile(offset, functor, policy); - } - } - } -#endif + execute_tile( + m_functor_reducer.get_functor(), m_policy, m_result_ptr, + std::integral_constant()); } - template + template + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) + : m_result_ptr(arg_result_view.data()), + m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + 
OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to \ + : functor) \ + reduction(custom \ + : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; - -#pragma omp for collapse(2) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ +reduction(+:result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, result); + else + functor(typename Policy::work_tag(), i0, i1, result); + } } -#endif + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -144,107 +138,119 @@ class ParallelFor, const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join( \ + omp_out, omp_in)) \ + initializer( \ + OpenMPTargetReducerWrapper ::init( \ + omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? 
end_2 : policy.m_upper[2]; - -#pragma omp for collapse(3) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ +reduction(+:result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_void::value) + functor(i0, i1, i2, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, result); + } } -#endif + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; const Index end_0 = policy.m_upper[0]; const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; const Index end_3 = policy.m_upper[3]; -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; - -#pragma omp for collapse(4) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ +reduction(+:result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + } } -#endif + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -257,64 +263,65 @@ class ParallelFor, const Index end_3 = policy.m_upper[3]; const Index end_4 = policy.m_upper[4]; -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + ValueType result = ValueType(); + + // 
FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? 
end_4 : policy.m_upper[4]; - -#pragma omp for collapse(5) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ +reduction(+:result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + result); + } } -#endif + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template + template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -329,140 +336,69 @@ class ParallelFor, const Index end_4 = policy.m_upper[4]; const Index end_5 = policy.m_upper[5]; -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication 
for different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to \ + : functor) \ + reduction(custom \ + : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); + functor(i0, i1, i2, i3, i4, i5, result); else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); } } } } } } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - - const ptrdiff_t begin_5 = offset[5]; - ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; - end_5 = end_5 < policy.m_upper[5] ? 
end_5 : policy.m_upper[5]; - -#pragma omp for collapse(6) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) - for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ +reduction(+:result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + if constexpr (std::is_same::value) + functor(i0, i1, i2, i3, i4, i5, result); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + result); + } } -#endif - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. 
We need something better once we can query these kinds of - // properties - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy; - - bool m_result_ptr_on_device; - - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_memory_lock; + } + } + } + } + } - public: - inline void execute() const { - execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr); + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); } - template - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(arg_result_view.data()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} - template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -509,9 +445,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -567,9 +503,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[3]; @@ -630,9 +566,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& 
functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -701,9 +637,9 @@ reduction(+:result) } template - inline std::enable_if_t execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { + inline std::enable_if_t execute_tile( + const FunctorType& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -788,5 +724,4 @@ reduction(+:result) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES -#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ +#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ From 77ea52f97685908e62e11f10fe263f51f4fc0c46 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 30 Apr 2024 20:49:51 -0400 Subject: [PATCH 396/432] Threads: Don't silently allow m_instance to be a nullptr (#6969) * Threads: Don't silently allow m_instance to be a nullptr * Assert that m_instance is not nullptr --- core/src/Threads/Kokkos_Threads_Team.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index fd0f221365b..a3501a437d2 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -188,8 +188,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return value; - if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -229,8
+227,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return; - type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution @@ -285,8 +281,6 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -358,6 +352,7 @@ class ThreadsExecTeamMember { m_chunk_size(team.chunk_size()), m_league_chunk_end(0), m_team_alloc(team.team_alloc()) { + KOKKOS_ASSERT(m_instance != nullptr); if (team.league_size()) { // Execution is using device-team interface: From f699a2c7a2668832e74747ed4816bef683937ba7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Apr 2024 21:44:09 -0400 Subject: [PATCH 397/432] Fix enabling OpenMP with HIP and "compile as CMake language" --- cmake/kokkos_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index df01f200d13..c7c352ae35f 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -111,6 +111,9 @@ IF (Kokkos_ENABLE_OPENMP) ELSE() KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) ENDIF() + IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + ENDIF() ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From 2574b802922d2d13cdd80b2b171ff5f3cd5ef15b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 30 Apr 2024 21:49:46 -0400 Subject: [PATCH 398/432] Fix OpenMP+CUDA when `Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE` is `ON` Co-Authored-By: Daniel Arndt --- cmake/kokkos_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index c7c352ae35f..cda9e0d6004 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -114,6 +114,9 @@ IF (Kokkos_ENABLE_OPENMP) 
IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) ENDIF() + IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + ENDIF() ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From ccd0126b88a08b27c86f11ecd0447454b96b0b52 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 May 2024 12:03:36 -0400 Subject: [PATCH 399/432] Fix fedora CI builds with flang-new --- example/build_cmake_installed/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/example/build_cmake_installed/CMakeLists.txt b/example/build_cmake_installed/CMakeLists.txt index aaf745b418d..c025f1d7d28 100644 --- a/example/build_cmake_installed/CMakeLists.txt +++ b/example/build_cmake_installed/CMakeLists.txt @@ -12,6 +12,7 @@ find_package(Kokkos REQUIRED) add_executable(example cmake_example.cpp foo.f) if(CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang) set_target_properties(example PROPERTIES LINKER_LANGUAGE Fortran) + target_link_options(example PRIVATE -fno-fortran-main) endif() # This is the only thing required to set up compiler/linker flags From 45a14049163732fbc5eb249282d15424ebb91d55 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 2 May 2024 09:22:10 -0600 Subject: [PATCH 400/432] Fix Copyright file --- Copyright.txt | 49 ++++++++----------------------------------------- LICENSE | 10 ---------- 2 files changed, 8 insertions(+), 51 deletions(-) diff --git a/Copyright.txt b/Copyright.txt index 5e2f8d8647b..cbba3efc7bc 100644 --- a/Copyright.txt +++ b/Copyright.txt @@ -1,41 +1,8 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER +************************************************************************ + + Kokkos v. 4.0 + Copyright (2022) National Technology & Engineering + Solutions of Sandia, LLC (NTESS). + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. 
diff --git a/LICENSE b/LICENSE index 6572cc2db05..4d9d69d7c44 100644 --- a/LICENSE +++ b/LICENSE @@ -1,13 +1,3 @@ - ************************************************************************ - - Kokkos v. 4.0 - Copyright (2022) National Technology & Engineering - Solutions of Sandia, LLC (NTESS). - - Under the terms of Contract DE-NA0003525 with NTESS, - the U.S. Government retains certain rights in this software. - - ============================================================================== Kokkos is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== From c6d86474a83c460e9de37bba938b63f5d9580070 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 May 2024 16:10:07 -0400 Subject: [PATCH 401/432] Also use is_nothrow_swappable workaround for Intel Classic Compilers (#6983) * Also use is_nothrow_swappable workaround for Intel Classic Compilers * Use template parameter U directly in kokkos_swap overload --- core/src/Kokkos_Array.hpp | 2 +- core/src/Kokkos_Swap.hpp | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 29e5edd9baa..0a1ced93c8f 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -138,7 +138,7 @@ struct Array { friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< Impl::is_swappable::value> kokkos_swap(Array& a, - Array& b) noexcept(Impl::is_nothrow_swappable_v) { + Array& b) noexcept(Impl::is_nothrow_swappable_v) { for (std::size_t i = 0; i < N; ++i) { kokkos_swap(a[i], b[i]); } diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp index 907f8607a7e..2f849a13ab6 100644 --- a/core/src/Kokkos_Swap.hpp +++ b/core/src/Kokkos_Swap.hpp @@ -49,17 +49,9 @@ struct is_swappable { !std::is_same_v(0)), Nope>; }; -#if defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140) -template -inline constexpr bool is_nothrow_swappable_v = - is_swappable::value&& 
noexcept( - kokkos_swap(std::declval&>(), - std::declval&>())); -#else template inline constexpr bool is_nothrow_swappable_v = noexcept(kokkos_swap(std::declval(), std::declval())); -#endif } // namespace Impl From 69567f3051dbdc51e65221c8b68ee41849749a4f Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 3 May 2024 08:57:09 -0400 Subject: [PATCH 402/432] Add thread-safety tests (#6938) * Add thread-safety tests * Disable thread-safety tests for Serial and OpenMP for now * Cleanup include and namespace * Skip tests for OpenACC in CMakeLists.txt * Avoid std::move * Comment on tests * Use more atomics * Simplify test --- core/unit_test/CMakeLists.txt | 7 + core/unit_test/TestExecSpaceThreadSafety.hpp | 319 +++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 core/unit_test/TestExecSpaceThreadSafety.hpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 3b14bec03a2..4d0ce3b22e3 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -148,6 +148,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Crs DeepCopyAlignment ExecSpacePartitioning + ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis HostSharedPtr @@ -426,6 +427,7 @@ if(Kokkos_ENABLE_OPENACC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpaceThreadSafety.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp @@ -637,6 +639,8 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) endif() if(Kokkos_ENABLE_SERIAL) + list(REMOVE_ITEM Serial_SOURCES1 + 
${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_ExecSpaceThreadSafety.cpp) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_Serial1 SOURCES @@ -667,6 +671,9 @@ if(Kokkos_ENABLE_THREADS) endif() if (Kokkos_ENABLE_OPENMP) + list(REMOVE_ITEM OpenMP_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_ExecSpaceThreadSafety.cpp) + set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp ) diff --git a/core/unit_test/TestExecSpaceThreadSafety.hpp b/core/unit_test/TestExecSpaceThreadSafety.hpp new file mode 100644 index 00000000000..20b802babe0 --- /dev/null +++ b/core/unit_test/TestExecSpaceThreadSafety.hpp @@ -0,0 +1,319 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +#ifdef KOKKOS_ENABLE_OPENMP +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) l1(); + if (omp_get_thread_num() == 1) l2(); + } +} +// We cannot run the multithreaded test when threads or HPX is enabled because +// we cannot launch a thread from inside another thread +#elif !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_HPX) +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + std::thread t1(l1); + std::thread t2(l2); + t1.join(); + t2.join(); +} +#else +template +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + l1(); + l2(); +} +#endif + +// The idea for all of these tests is to access a View from kernels submitted by +// two different threads to the same execution space instance. If the kernels +// are executed concurrently, we expect to count too many increments. 
+void run_exec_space_thread_safety_range() { + constexpr int N = 10000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::RangePolicy(exec, 0, 1), KOKKOS_LAMBDA(int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_range(); +} + +void run_exec_space_thread_safety_mdrange() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_mdrange(); +} + +void run_exec_space_thread_safety_team_policy() { + constexpr int N = 1000000; + constexpr int M = 10; + + 
Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member) { + Kokkos::single(Kokkos::PerTeam(team_member), [=]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + run_exec_space_thread_safety_team_policy(); +} + +void run_exec_space_thread_safety_range_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_reduce) { + run_exec_space_thread_safety_range_reduce(); +} + +void run_exec_space_thread_safety_mdrange_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + 
+ auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange_reduce) { +// FIXME_INTEL +#ifdef KOKKOS_COMPILER_INTEL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail for OpenMP using the " + "legacy Intel compiler"; +#endif + run_exec_space_thread_safety_mdrange_reduce(); +} + +void run_exec_space_thread_safety_team_policy_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type + &team_member, + int &update) { + Kokkos::single(Kokkos::PerTeam(team_member), [=, &update]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }); + }, + error); + } + }; + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy_reduce) { +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + // FIXME_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && 
defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is know to fail with SYCL+Cuda"; +#endif + run_exec_space_thread_safety_team_policy_reduce(); +} + +void run_exec_space_thread_safety_range_scan() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View view("view"); + Kokkos::View error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_scan( + Kokkos::RangePolicy(exec, 0, 1), + KOKKOS_LAMBDA(int, int &, const bool final) { + if (final) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + } + }); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_scan) { + run_exec_space_thread_safety_range_scan(); +} + +} // namespace From 9c7920291d7fc100ed94133da0dbe2412c8d2f05 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Fri, 3 May 2024 18:30:04 -0400 Subject: [PATCH 403/432] Fix deprecation warnings with GCC for pair comparison operators Co-Authored-By: Andrey Prokopenko --- core/src/Kokkos_Pair.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 9c3516eb222..2b7f275d06d 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -450,37 +450,37 @@ struct KOKKOS_DEPRECATED pair { // template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( const pair& lhs, const pair& rhs) { return lhs.first == rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( const 
pair& lhs, const pair& rhs) { return !(lhs == rhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( const pair& lhs, const pair& rhs) { return !(rhs < lhs); } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( const pair& lhs, const pair& rhs) { return rhs < lhs; } template -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } From 7b8e3a68fcbf3a8deef67dbf5287c5331c73df3a Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 17:23:34 +0000 Subject: [PATCH 404/432] Fix TPL_LIBRARY_SUFFIXES for 32-bit build --- cmake/kokkos_functions.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/kokkos_functions.cmake b/cmake/kokkos_functions.cmake index 9dab1ca00ea..d1f1e0d7a78 100644 --- a/cmake/kokkos_functions.cmake +++ b/cmake/kokkos_functions.cmake @@ -709,7 +709,12 @@ MACRO(kokkos_find_imported NAME) ENDIF() IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib lib64) + SET(TPL_LIBRARY_SUFFIXES lib) + IF(KOKKOS_IMPL_32BIT) + LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) + ELSE() + LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) + ENDIF() ENDIF() SET(${NAME}_INCLUDE_DIRS) From 28260178f4d68fb2bfb7ccfc8d7239e264b1d166 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 13:34:15 -0400 Subject: [PATCH 405/432] Avoid duplicated definition of KOKKOS_IMPL_32BIT --- .github/workflows/continuous-integration-workflow-32bit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 87c21d3a6e7..0260cb5894a 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -36,7 +36,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_FLAGS="-Werror -m32" \ -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=RelWithDebInfo - name: Build From ccadc7d9ba2be086be38a8f1731c5df9339fbe06 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 6 May 2024 14:15:50 -0400 Subject: [PATCH 406/432] Disable failing parallel_scan_with_reducers test --- core/unit_test/TestTeamVector.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 5e16539d652..e278789992f 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1067,6 +1067,10 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) { } #endif +#ifdef KOKKOS_IMPL_32BIT + GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT"; // FIXME_32BIT +#endif + checkScan>() .run(); From d61d75aceceacf4c3b5a6463626f58826ce47849 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Wed, 8 May 2024 07:41:50 -0400 Subject: [PATCH 407/432] Fix a bug when using realloc on views of non-default constructible element types (#6993) * Add few missing constexpr for alloc_prop_input Co-authored-by: Daniel Arndt * Update tests * Fix DualView * Address review comments * Add missing decorators * Move NoDefaultConstructor out of function --------- Co-authored-by: Daniel Arndt --- containers/src/Kokkos_DualView.hpp | 8 ++++---- core/src/Kokkos_CopyViews.hpp | 10 ++++++++-- core/unit_test/TestRealloc.hpp | 13 +++++++++++++ core/unit_test/TestResize.hpp | 13 +++++++++++++ 4 files changed, 38 
insertions(+), 6 deletions(-) diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index e821570a8d5..1fb174943fe 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -944,13 +944,13 @@ class DualView : public ViewTraits { if (sizeMismatch) { ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, typename t_host::memory_space(), d_view); } - } else if (alloc_prop_input::initialize) { + } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -1038,7 +1038,7 @@ class DualView : public ViewTraits { /* Resize on Device */ if (sizeMismatch) { ::Kokkos::resize(properties, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { h_view = create_mirror_view(typename t_host::memory_space(), d_view); } else { h_view = create_mirror_view(Kokkos::WithoutInitializing, @@ -1054,7 +1054,7 @@ class DualView : public ViewTraits { /* Resize on Host */ if (sizeMismatch) { ::Kokkos::resize(properties, h_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { + if constexpr (alloc_prop_input::initialize) { d_view = create_mirror_view(typename t_dev::memory_space(), h_view); } else { diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index ee8d1e09d3a..40fdd590f6f 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -3235,7 +3235,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, v = view_type(); // Best effort to deallocate in case no other view refers // to the shared allocation v = view_type(arg_prop_copy, n0, n1, 
n2, n3, n4, n5, n6, n7); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); @@ -3330,7 +3333,10 @@ impl_realloc(Kokkos::View& v, if (v.layout() != layout) { v = view_type(); // Deallocate first, if the only view to allocation v = view_type(arg_prop, layout); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property(arg_prop); diff --git a/core/unit_test/TestRealloc.hpp b/core/unit_test/TestRealloc.hpp index 2c9dc5ee473..f30c9e15e1c 100644 --- a/core/unit_test/TestRealloc.hpp +++ b/core/unit_test/TestRealloc.hpp @@ -144,6 +144,11 @@ void impl_testRealloc() { EXPECT_EQ(oldPointer, newPointer); } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; template void testRealloc() { @@ -154,6 +159,14 @@ void testRealloc() { impl_testRealloc(); // without data initialization } + // Check #6992 fix (no default initialization in realloc without initializing) + { + using view_type = Kokkos::View; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + realloc_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewRealloc diff --git a/core/unit_test/TestResize.hpp b/core/unit_test/TestResize.hpp index 13d7e16d589..3102d2b9a16 100644 --- a/core/unit_test/TestResize.hpp +++ b/core/unit_test/TestResize.hpp @@ -358,6 +358,12 @@ void impl_testResize() { } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; + template void testResize() { { @@ -367,6 +373,13 @@ void testResize() { impl_testResize(); // without data initialization } + { + using view_type = Kokkos::View; + 
view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + resize_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewResize From 50a862cf63d532d3de6d7dc2767279d0725cd05d Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 May 2024 12:05:05 -0400 Subject: [PATCH 408/432] SYCL: Prepare Parallel* for Graphs (#6988) * SYCL: Make Parallel* copyable * Address review comments * Refactor Team policies further * Fix alias for SYCL TeamPolicy ParallelReduce * Improve const-correctness in Kokkos_SYCL_ParallelReduce_Team * Fix up Kokkos_SYCL_ParallelReduce_Team.hpp --- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 6 -- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 6 -- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 53 ++++++------ .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 14 ++-- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 14 ++-- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 84 +++++++++---------- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 14 ++-- 7 files changed, 83 insertions(+), 108 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index 7fbf5420f83..b58885192b9 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -181,12 +181,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy), diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..2f8db922d3d 100644 --- 
a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -137,12 +137,6 @@ class Kokkos::Impl::ParallelFor, functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index d98f4837315..57ff97e7f31 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -28,7 +28,7 @@ template class Kokkos::Impl::ParallelFor, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using functor_type = FunctorType; using size_type = ::Kokkos::Experimental::SYCL::size_type; @@ -44,19 +44,14 @@ class Kokkos::Impl::ParallelFor, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl_device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor instance at a time use the team scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; template - sycl::event sycl_direct_launch(const Policy& policy, + sycl::event sycl_direct_launch(const sycl_device_ptr global_scratch_ptr, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -72,7 +67,6 @@ class Kokkos::Impl::ParallelFor, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl_device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( @@ -125,17 +119,31 @@ class Kokkos::Impl::ParallelFor, inline void execute() const { if (m_league_size == 0) return; - auto& space = *m_policy.space().impl_internal_space_instance(); + auto& instance = *m_policy.space().impl_internal_space_instance(); + + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = space.get_indirect_kernel_mem(); + indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( m_functor, indirectKernelMem); - sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, + sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); - space.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -143,10 +151,7 @@ class Kokkos::Impl::ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = @@ -159,22 +164,14 @@ class Kokkos::Impl::ParallelFor, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
- auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const auto& instance = *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! " "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index c1414ee0581..79f8afd4a3d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -77,9 +77,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -330,6 +328,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -349,10 +353,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index dbe2366b8bc..2bad7749759 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -50,9 +50,7 @@ class 
Kokkos::Impl::ParallelReduce::accessible), - m_scratch_buffers_lock( - p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} + typename View::memory_space>::accessible) {} private: template @@ -347,6 +345,12 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -366,10 +370,6 @@ class Kokkos::Impl::ParallelReduce m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 1332fafde94..43c6ca44019 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -30,7 +30,7 @@ class Kokkos::Impl::ParallelReduce, Kokkos::Experimental::SYCL> { public: - using Policy = TeamPolicyInternal; + using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -54,24 +54,18 @@ class Kokkos::Impl::ParallelReduce m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; const size_type m_vector_size; - // Only let one ParallelReduce instance at a time use the team scratch memory - // and the host scratch memory. The constructor acquires the mutex which is - // released in the destructor. 
- std::scoped_lock m_scratch_buffers_lock; - int m_scratch_pool_id = -1; - template + template sycl::event sycl_direct_launch( - const PolicyType& policy, + const sycl_device_ptr global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); + const Kokkos::Experimental::SYCL& space = m_policy.space(); Kokkos::Experimental::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -113,7 +107,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -170,7 +163,6 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = @@ -386,6 +378,22 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( + instance.m_mutexScratchSpace); + std::scoped_lock team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast(m_scratch_size[1]) * m_league_size)); + using IndirectKernelMem = Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -395,14 +403,24 @@ class Kokkos::Impl::ParallelReduce + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = m_policy.team_size_recommended( @@ -423,22 +441,15 @@ class Kokkos::Impl::ParallelReduce>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * m_league_size)); - - if (static_cast(space.m_maxShmemPerBlock) < + const Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + if (static_cast(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } @@ -448,25 +459,6 @@ class Kokkos::Impl::ParallelReduce requested too large team size."); } - - public: - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_buffers_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; #endif diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bfc3fba7412..b3d3e9e35ce 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -150,10 +150,6 @@ class ParallelScanSYCLBase { pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the host scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_buffers_lock; - private: template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, @@ -367,6 +363,11 @@ class ParallelScanSYCLBase { auto& instance = *m_policy.space().impl_internal_space_instance(); + // Only let one instance at a time resize the instance's scratch memory + // allocations. 
+ std::scoped_lock scratch_buffers_lock( + instance.m_mutexScratchSpace); + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); @@ -385,10 +386,7 @@ class ParallelScanSYCLBase { : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_buffers_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} }; } // namespace Kokkos::Impl From f5b34222c166c71e86ea44fd7867d443ba25856e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 May 2024 13:27:28 -0400 Subject: [PATCH 409/432] SYCL: Fix deprecation in custom parallel_for RangePolicy implementation --- core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index b4de7eb89ff..341c6c335d4 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -43,8 +43,8 @@ template struct FunctorWrapperRangePolicyParallelForCustom { using WorkTag = typename Policy::work_tag; - void operator()(sycl::item<1> item) const { - const typename Policy::index_type id = item.get_linear_id(); + void operator()(sycl::nd_item<1> item) const { + const typename Policy::index_type id = item.get_global_linear_id(); if (id < m_work_size) { const auto shifted_id = id + m_begin; if constexpr (std::is_void_v) From 37986fde4cee878aa4d9f60ae11f7ea6c80976ff Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 8 May 2024 14:18:41 -0600 Subject: [PATCH 410/432] [ci skip] update changelog for 4.3.1 (#6995) * [ci skip] update changelog for 4.3.1 * changelog: fixup --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 
insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c70ee5505f8..f8d288db5da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## [4.3.01](https://github.com/kokkos/kokkos/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.00...4.3.01) + +### Backend and Architecture Enhancements: + +#### HIP: +* MI300 support unified memory [\#6877](https://github.com/kokkos/kokkos/pull/6877) + +### Bug Fixes +* Serial: Use the provided execution space instance in TeamPolicy [\#6951](https://github.com/kokkos/kokkos/pull/6951) +* `nvcc_wrapper`: bring back support for `--fmad` option [\#6931](https://github.com/kokkos/kokkos/pull/6931) +* Fix CUDA reduction overflow for `RangePolicy` [\#6578](https://github.com/kokkos/kokkos/pull/6578) + ## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) From 7cad3e7c3b66bf3e6f4de75bd9043abde9f8194a Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Wed, 8 May 2024 14:01:34 -0700 Subject: [PATCH 411/432] OpenMPTarget: Use mutex lock for parallel scan. --- .../Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index c1f7851f413..c886c397966 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -48,6 +48,10 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; + // Only let one ParallelScan instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_memory_lock; + template std::enable_if_t::value> call_with_tag( const FunctorType& f, const idx_type& idx, value_type& val, @@ -197,7 +201,8 @@ class ParallelScan, : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} //---------------------------------------- }; From 00170ae80cb54b39dd11f77dcef4318c754afd5a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 May 2024 11:33:01 -0400 Subject: [PATCH 412/432] Remove cuSPARSE TPL It looks like an oversight. It is unused. Example code that referred to it was removed in #2688 because it was just sitting there, i.e. not built nor tested. KokkosKernels has its own CMake logic to find it and link against it. --- cmake/Dependencies.cmake | 1 - cmake/deps/CUDA.cmake | 1 - cmake/deps/CUSPARSE.cmake | 26 -------------------------- cmake/tpls/FindTPLCUSPARSE.cmake | 26 -------------------------- 4 files changed, 54 deletions(-) delete mode 100644 cmake/deps/CUSPARSE.cmake delete mode 100644 cmake/tpls/FindTPLCUSPARSE.cmake diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 611c089b2e3..fb1e73b5799 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE ) TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/cmake/deps/CUDA.cmake b/cmake/deps/CUDA.cmake index 68bf5b3d579..5b6afd61512 100644 --- a/cmake/deps/CUDA.cmake +++ b/cmake/deps/CUDA.cmake @@ -35,7 +35,6 @@ IF(NOT _CUDA_FAILURE) GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) 
- KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) ELSE() SET(TPL_ENABLE_CUDA OFF) ENDIF() diff --git a/cmake/deps/CUSPARSE.cmake b/cmake/deps/CUSPARSE.cmake deleted file mode 100644 index b016971ab91..00000000000 --- a/cmake/deps/CUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ************************************************************************ -# @HEADER - -#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) - -#IF (TPL_ENABLE_CUDA) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) -# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -#ENDIF() - diff --git a/cmake/tpls/FindTPLCUSPARSE.cmake b/cmake/tpls/FindTPLCUSPARSE.cmake deleted file mode 100644 index 4709f8002b1..00000000000 --- a/cmake/tpls/FindTPLCUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#@HEADER - -# Check for CUDA support - -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") -ELSE() - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -ENDIF() - From 1d9d0df2eecfce635fe5c77559eb17adfa128d04 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 10 May 2024 11:49:19 -0400 Subject: [PATCH 413/432] SYCL: Print submission command queue property (#7004) * SYCL: Print submission command queue property * Also print SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS if set * Reword printout for environment variable Co-authored-by: Damien L-G --------- Co-authored-by: Damien L-G --- core/src/SYCL/Kokkos_SYCL.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 9a246f7642f..de5ddf405d4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -110,6 +110,26 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif +#ifdef SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST + if (sycl_queue() + .has_property< + sycl::ext::intel::property::queue::immediate_command_list>()) + os << "Immediate command lists enforced\n"; + else if (sycl_queue() + .has_property()) + os << "Standard command queue enforced\n"; + else +#endif + { + os << "Immediate command lists and standard command queue allowed.\n"; + if (const char* environment_setting = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS")) + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=" + << environment_setting << " takes precedence.\n"; + else + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS not defined.\n"; + } int counter = 0; int active_device = Kokkos::device_id(); From 
cadab6c1ed26cfef885fdb29c6d3eace98862f3e Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 7 May 2024 17:35:21 -0400 Subject: [PATCH 414/432] Test DualView resize/realloc for types without default constructor --- containers/unit_tests/TestDualView.hpp | 61 ++++++++++++++++++++------ 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index a15e5fa2997..ecb06d1c652 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -282,15 +282,20 @@ struct test_dualview_resize { const unsigned int m = 5; const unsigned int factor = 2; - ViewType a("A", n, m); + ViewType a; + if constexpr (Initialize) + a = ViewType("A", n, m); + else + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::deep_copy(a.d_view, 1); /* Covers case "Resize on Device" */ a.modify_device(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); - else + if constexpr (Initialize) Kokkos::resize(a, factor * n, factor * m); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); ASSERT_EQ(a.extent(0), n * factor); ASSERT_EQ(a.extent(1), m * factor); @@ -305,7 +310,7 @@ struct test_dualview_resize { Kokkos::parallel_reduce( Kokkos::RangePolicy(0, a.d_view.extent(0)), SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::Sum(a_d_sum)); // Check host view is synced as expected scalar_type a_h_sum = 0; @@ -321,10 +326,10 @@ struct test_dualview_resize { /* Covers case "Resize on Host" */ a.modify_host(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); - else + if constexpr (Initialize) Kokkos::resize(a, n / factor, m / factor); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); ASSERT_EQ(a.extent(0), n / factor); ASSERT_EQ(a.extent(1), m / factor); @@ -339,7 +344,7 @@ struct test_dualview_resize { 
Kokkos::parallel_reduce( Kokkos::RangePolicy(0, a.d_view.extent(0)), SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::Sum(a_d_sum)); // Check host view is synced as expected a_h_sum = 0; @@ -369,13 +374,17 @@ struct test_dualview_realloc { const unsigned int n = 10; const unsigned int m = 5; - ViewType a("A", n, m); - if (Initialize) - Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); - else + ViewType a; + if constexpr (Initialize) { + a = ViewType("A", n, m); Kokkos::realloc(a, n, m); + } else { + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); + } Kokkos::deep_copy(a.d_view, 1); + a.modify_device(); a.sync_host(); @@ -387,7 +396,7 @@ struct test_dualview_realloc { Kokkos::parallel_reduce( Kokkos::RangePolicy(0, a.d_view.extent(0)), SumViewEntriesFunctor(a.d_view), - a_d_sum); + Kokkos::Sum(a_d_sum)); // Check host view is synced as expected scalar_type a_h_sum = 0; @@ -463,12 +472,36 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { test_dualview_deep_copy(); } +struct NoDefaultConstructor { + NoDefaultConstructor(int i_) : i(i_) {} + + operator int() const { return i; } + NoDefaultConstructor& operator+=(const NoDefaultConstructor& other) { + i += other.i; + return *this; + } + + int i; +}; +} // namespace Test + +template <> +struct Kokkos::reduction_identity { + static Test::NoDefaultConstructor sum() { return {0}; } +}; + +namespace Test { + TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc(); + Impl::test_dualview_realloc(); } TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); + Impl::test_dualview_resize(); } namespace { From df018d97f52a7bfa29cf7d29fbda42f244001f40 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 13 May 2024 22:59:25 +0200 Subject: [PATCH 415/432] Suppress deprecated warnings via pragma push/pop in the tests (#6999) * Introduce `KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_{PUSH,POP} macros to suppress diagnostics when 
appropriate * Suppress all deprecated warnings I can see in tests * Update EDG diag suppress to fix the Intel Compiler Classic and provide a fallback empty definition for the macros --- containers/unit_tests/TestVector.hpp | 2 ++ core/src/Kokkos_Macros.hpp | 25 +++++++++++++++++++ core/unit_test/TestArrayOps.hpp | 2 ++ .../incremental/Test01_execspace.hpp | 2 ++ simd/unit_tests/include/SIMDTesting_Ops.hpp | 2 ++ 5 files changed, 33 insertions(+) diff --git a/containers/unit_tests/TestVector.hpp b/containers/unit_tests/TestVector.hpp index a7d341b789d..19901a52ad5 100644 --- a/containers/unit_tests/TestVector.hpp +++ b/containers/unit_tests/TestVector.hpp @@ -21,6 +21,8 @@ #include #include #include +#include +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #include namespace Test { diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index b255d2a5195..27b32b15214 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -562,6 +562,31 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc)) #endif +// clang-format off +#if defined(__EDG__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning push") \ + _Pragma("warning disable 1478") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning pop") +#elif defined(__GNUC__) || defined(__clang__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning(push)") \ + _Pragma("warning(disable: 4996)") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning(pop)") +#else + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #define 
KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +// clang-format on + #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ diff --git a/core/unit_test/TestArrayOps.hpp b/core/unit_test/TestArrayOps.hpp index 387589fbe88..6b8e0f3aca3 100644 --- a/core/unit_test/TestArrayOps.hpp +++ b/core/unit_test/TestArrayOps.hpp @@ -112,6 +112,7 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array::contiguous>; @@ -390,6 +391,7 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif } // namespace diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index d7b2a57b442..a7fa26c7282 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -63,7 +63,9 @@ struct TestIncrExecSpace { ASSERT_GT(concurrency, 0); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() int in_parallel = ExecSpace::in_parallel(); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() ASSERT_FALSE(in_parallel); #endif diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index c587ccf3046..74141f25316 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -81,7 +81,9 @@ class absolutes { auto on_host(T const& a) const { if constexpr (std::is_signed_v) { #if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() return Kokkos::Experimental::abs(a); + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #else return Kokkos::abs(a); #endif From da8be22574e62efc901bc9da540a863d98b8492a Mon 
Sep 17 00:00:00 2001 From: Seyong Lee Date: Wed, 15 May 2024 15:21:52 -0400 Subject: [PATCH 416/432] This PR changes the default execution behavior of the parallel_for(team-policy) constructs in the OpenACC backend. - This PR handles a missing case not covered by the previous PR #6772 This PR also fixes the OpenACC backend error in the thread-safety test in PR #6938. --- core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp | 8 ++++++-- core/unit_test/CMakeLists.txt | 1 - 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index 4fce680aef0..2b98018e3bb 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -44,10 +44,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang vector num_gangs(league_size) \ - vector_length(team_size* vector_length) copyin(a_functor) + vector_length(team_size* vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size * team_size * vector_length; i++) { int league_id = i / (team_size * vector_length); typename Policy::member_type team(league_id, league_size, team_size, @@ -145,10 +147,12 @@ class Kokkos::Impl::ParallelFor, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang num_gangs(league_size) num_workers(team_size) \ - vector_length(vector_length) copyin(a_functor) + vector_length(vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size; i++) { int league_id = i; typename Policy::member_type team(league_id, league_size, team_size, diff 
--git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 4d0ce3b22e3..4344b74e5e7 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -427,7 +427,6 @@ if(Kokkos_ENABLE_OPENACC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpaceThreadSafety.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp From 2b7b98a1a6e9138389813c6e4115672459f02195 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 16 May 2024 13:30:18 +0000 Subject: [PATCH 417/432] Use parallel_for instead of parallel_reduce for check --- containers/unit_tests/TestDualView.hpp | 113 ++++++++++++------------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/containers/unit_tests/TestDualView.hpp b/containers/unit_tests/TestDualView.hpp index ecb06d1c652..2512cb5c491 100644 --- a/containers/unit_tests/TestDualView.hpp +++ b/containers/unit_tests/TestDualView.hpp @@ -55,8 +55,8 @@ struct test_dualview_alloc { bool result = false; test_dualview_alloc(unsigned int size) { - result = run_me >( - size, 3); + result = + run_me>(size, 3); } }; @@ -154,7 +154,7 @@ struct test_dualview_combinations { } test_dualview_combinations(unsigned int size, bool with_init) { - result = run_me >( + result = run_me>( size, 3, with_init); } }; @@ -253,21 +253,18 @@ struct test_dual_view_deep_copy { } // end run_me test_dual_view_deep_copy() { - run_me >(10, 5, - true); - run_me >(10, 5, - false); + run_me>(10, 5, true); + run_me>(10, 5, + false); // Test zero length but allocated (a.d_view.data!=nullptr but // a.d_view.span()==0) - run_me >(0, 5, true); - run_me >(0, 5, - false); + 
run_me>(0, 5, true); + run_me>(0, 5, false); // Test default constructed view - run_me >(-1, 5, - true); - run_me >(-1, 5, - false); + run_me>(-1, 5, true); + run_me>(-1, 5, + false); } }; @@ -303,25 +300,30 @@ struct test_dualview_resize { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - Kokkos::Sum(a_d_sum)); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, a_d_sum); - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); /* Covers case "Resize on Host" */ a.modify_host(); @@ -337,30 +339,33 @@ struct test_dualview_resize { a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected - a_d_sum = 0; + Kokkos::deep_copy(errors_d, 0); // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - Kokkos::Sum(a_d_sum)); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), 
a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - a_h_sum = 0; + errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_resize() { - run_me >(); + run_me>(); } }; @@ -382,6 +387,8 @@ struct test_dualview_realloc { a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); } + ASSERT_EQ(a.extent(0), n); + ASSERT_EQ(a.extent(1), m); Kokkos::deep_copy(a.d_view, 1); @@ -389,29 +396,34 @@ struct test_dualview_realloc { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, a.d_view.extent(0)), - SumViewEntriesFunctor(a.d_view), - Kokkos::Sum(a_d_sum)); + Kokkos::View errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, 
scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_realloc() { - run_me >(); + run_me>(); } }; @@ -474,23 +486,10 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { struct NoDefaultConstructor { NoDefaultConstructor(int i_) : i(i_) {} - - operator int() const { return i; } - NoDefaultConstructor& operator+=(const NoDefaultConstructor& other) { - i += other.i; - return *this; - } + KOKKOS_FUNCTION operator int() const { return i; } int i; }; -} // namespace Test - -template <> -struct Kokkos::reduction_identity { - static Test::NoDefaultConstructor sum() { return {0}; } -}; - -namespace Test { TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc(); From fc4383ab6f7a200cd2557f68c042bf2b59e8fa97 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Sun, 19 May 2024 09:26:31 -0500 Subject: [PATCH 418/432] Fix unique_any_senders nvcc template deduction --- core/src/HPX/Kokkos_HPX.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 6d541a64148..1f3d0783449 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -153,7 +153,7 @@ void HPX::impl_instance_fence_locked(const std::string &name) const { auto &s = impl_get_sender(); hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } @@ -184,7 +184,7 @@ void HPX::impl_static_fence(const std::string &name) { } hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } From 226aecfb8c161042d88421ac0176aa3c6d697fb6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 20 May 2024 13:01:30 -0400 
Subject: [PATCH 419/432] Properly guard deprecated `Kokkos_Vector.hpp` header self contained test (#7016) * Properly guard deprecated header self contained test registration * Unconditionally remove the Kokkos_Vector.hpp header self contained test * On second thought prefer guards * Fix typo disa[b]led Co-authored-by: Daniel Arndt --------- Co-authored-by: Daniel Arndt --- core/unit_test/headers_self_contained/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/unit_test/headers_self_contained/CMakeLists.txt b/core/unit_test/headers_self_contained/CMakeLists.txt index f792b03ed88..4c364ceee75 100644 --- a/core/unit_test/headers_self_contained/CMakeLists.txt +++ b/core/unit_test/headers_self_contained/CMakeLists.txt @@ -10,7 +10,8 @@ file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src ${BASE_DIR}/algorithms/src/*.hpp) -if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) +# erroring out when deprecated code is disabled and raising warnings that are treated as errors in the CI otherwise +if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 OR Kokkos_ENABLE_DEPRECATION_WARNINGS) list(REMOVE_ITEM KOKKOS_CONTAINERS_HEADERS "Kokkos_Vector.hpp") endif() From 81b63c5c5e6bb1b076f64bf6f9c975bba2aeddc6 Mon Sep 17 00:00:00 2001 From: Nicolas Morales Date: Mon, 20 May 2024 14:36:59 -0700 Subject: [PATCH 420/432] mdspan converting constructors (#6830) This PR adds conversions between mdspan and View - a (for now internal) "natural" mdspan type is introduced for View - constructors which take that "natural" mdspan type are added - to_mdspan and operator mdspan are introduced to get an mdspan from View - both leverage the "natural" mdspan type - all of them are restricted to the layout types we actually can convert - some error checks happen at runtime regarding padded Kokkos::View - includes some necessary updates for mdspan ppl - note we expect to do a clean mdspan merge before 
release --- core/src/Kokkos_View.hpp | 97 ++++ .../src/View/MDSpan/Kokkos_MDSpan_Extents.hpp | 19 +- core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 148 +++++ core/src/impl/Kokkos_ViewMapping.hpp | 254 ++++++--- core/unit_test/CMakeLists.txt | 7 +- core/unit_test/TestMDSpanConversion.hpp | 504 ++++++++++++++++++ .../view/TestExtentsDatatypeConversion.cpp | 11 +- .../__p0009_bits/layout_stride.hpp | 44 ++ .../experimental/__p0009_bits/macros.hpp | 5 + .../__p2642_bits/layout_padded.hpp | 484 +++++++++-------- 10 files changed, 1263 insertions(+), 310 deletions(-) create mode 100644 core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp create mode 100644 core/unit_test/TestMDSpanConversion.hpp diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index 09c6e780ef5..a6c6c955b87 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -38,6 +38,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include +#include #endif #include @@ -372,6 +373,32 @@ struct ViewTraits { //------------------------------------ }; +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename Impl::LayoutFromArrayLayout::type; + using mdspan_type = + mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + /** \class View * \brief View to an array of data. * @@ -1722,6 +1749,76 @@ class View : public ViewTraits { "Layout is not constructible from extent arguments. 
Use " "overload taking a layout object instead."); } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#if defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L) + // FIXME C++20 reevaluate after determining minium compiler versions + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#if defined(__cpp_conditional_explicit) && \ + (__cpp_conditional_explicit >= 201806L) + // FIXME C++20 reevaluate after determining minium compiler versions + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template , + typename Impl::MDSpanViewTraits::mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template , + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = OtherAccessorType()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN }; template diff --git a/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp 
b/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp index 3846b52d239..29d1e00adfc 100644 --- a/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp +++ b/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp @@ -37,9 +37,6 @@ struct ViewDimension; template struct ViewDataType; -} // namespace Kokkos::Impl - -namespace Kokkos::Experimental::Impl { // A few things to note -- // - mdspan allows for 0-rank extents similarly to View, so we don't need @@ -106,6 +103,20 @@ struct DataTypeFromExtents { // Will cause a compile error if it is malformed (i.e. dynamic after static) using type = typename ::Kokkos::Impl::ViewDataType::type; }; -} // namespace Kokkos::Experimental::Impl + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping_impl( + const VM &view_mapping, std::index_sequence) { + return Extents{view_mapping.extent(Indices)...}; +} + +template +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping( + const VM &view_mapping) { + static_assert(Extents::rank() == VM::Rank); + return extents_from_view_mapping_impl( + view_mapping, std::make_index_sequence{}); +} +} // namespace Kokkos::Impl #endif // KOKKOS_EXPERIMENTAL_MDSPAN_EXTENTS_HPP diff --git a/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp new file mode 100644 index 00000000000..8073dee1eed --- /dev/null +++ b/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -0,0 +1,148 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP +#define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP + +#include "Kokkos_MDSpan_Extents.hpp" +#include + +namespace Kokkos::Impl { + +template +struct LayoutFromArrayLayout; + +template <> +struct LayoutFromArrayLayout { + using type = Experimental::layout_left_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = Experimental::layout_right_padded; +}; + +template <> +struct LayoutFromArrayLayout { + using type = layout_stride; +}; + +template +KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( + const typename MDSpanType::mapping_type &mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + constexpr auto rank = extents_type::rank(); + const auto &ext = mapping.extents(); + + static_assert(rank <= ARRAY_LAYOUT_MAX_RANK, + "Unsupported rank for mdspan (must be <= 8)"); + + if constexpr (std::is_same_v) { + return Kokkos::LayoutStride{ + rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 0 ? mapping.stride(0) : 0, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? mapping.stride(1) : 0, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? mapping.stride(2) : 0, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? mapping.stride(3) : 0, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? mapping.stride(4) : 0, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? mapping.stride(5) : 0, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? mapping.stride(6) : 0, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? 
mapping.stride(7) : 0, + }; + } else { + // FIXME: Kokkos Layouts don't store stride (it's in the mapping) + // We could conceivably fix this by adding an extra ViewCtorProp for + // an abritrary padding. For now we will check for this. + if constexpr (rank > 1 && + (std::is_same_v< + typename mapping_type::layout_type, + Experimental::layout_left_padded> || + std::is_same_v< + typename mapping_type::layout_type, + Experimental::layout_right_padded>)) { + [[maybe_unused]] constexpr size_t strided_index = + std::is_same_v> + ? 1 + : rank - 2; + [[maybe_unused]] constexpr size_t extent_index = + std::is_same_v> + ? 0 + : rank - 1; + KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); + } + + return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + } +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + // std::span is not available in C++17 (our current requirements), + // so we need to use the std::array constructor for layout mappings. 
+ // FIXME When C++20 is available, we can use std::span here instead + std::size_t strides[VM::Rank]; + view_mapping.stride_fill(&strides[0]); + if constexpr (std::is_same_v) { + return mapping_type(Kokkos::mdspan_non_standard, + extents_from_view_mapping(view_mapping), + strides); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[1]); + } else if constexpr (VM::Rank > 1 && + std::is_same_v>) { + return mapping_type(extents_from_view_mapping(view_mapping), + strides[VM::Rank - 2]); + } else { + return mapping_type(extents_from_view_mapping(view_mapping)); + } +} + +} // namespace Kokkos::Impl + +#endif // KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 3217c76e380..c37112be896 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -647,34 +647,60 @@ struct ViewOffset< m_dim.N5 * m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // FIXME: The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_dim.N0; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_dim.N0; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements Stride with [ rank ] value is + // the total length + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -935,34 +961,59 @@ struct ViewOffset< m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_stride; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_stride; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + 
// clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1286,42 +1337,58 @@ struct ViewOffset< m_dim.N1; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; n *= m_dim.N1; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = n; } - s[dimension_type::rank] = n * m_dim.N0; + return n * m_dim.N0; + } + // clang-format on + + // 
Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1573,41 +1640,57 @@ struct ViewOffset< return m_stride; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride; } - s[dimension_type::rank] = m_stride * m_dim.N0; + return m_stride * m_dim.N0; + } + // clang-format on + + // Fill the target 
unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2133,34 +2216,50 @@ struct ViewOffset { return m_stride.S7; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - if (0 < dimension_type::rank) { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride.S0; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = m_stride.S1; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = m_stride.S2; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = m_stride.S3; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = m_stride.S4; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = m_stride.S5; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = m_stride.S6; } - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = m_stride.S7; } - s[dimension_type::rank] = span(); + return span(); + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. 
This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2814,11 +2913,24 @@ class ViewMapping< return m_impl_offset.stride_7(); } + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements template KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { m_impl_offset.stride(s); } + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + template + KOKKOS_INLINE_FUNCTION iType stride_fill(iType* const s) const { + return m_impl_offset.stride_fill(s); + } + //---------------------------------------- // Range span diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 4344b74e5e7..5df8d1e2cf8 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -174,7 +174,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) endforeach() set(${Tag}_SOURCES1B) - foreach(Name + set(${Tag}_TESTNAMES1B MDRange_a MDRange_b MDRange_c @@ -185,6 +185,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDRangePolicyConstructors MDRangeReduce MDSpan + MDSpanConversion MinMaxClamp NumericTraits OccupancyControlTrait @@ -206,6 +207,10 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) SharedAlloc Swap ) + IF (NOT Kokkos_ENABLE_IMPL_MDSPAN) + LIST(REMOVE_ITEM 
${Tag}_TESTNAMES1B MDSpanConversion) + ENDIF() + foreach(Name IN LISTS ${Tag}_TESTNAMES1B) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. diff --git a/core/unit_test/TestMDSpanConversion.hpp b/core/unit_test/TestMDSpanConversion.hpp new file mode 100644 index 00000000000..6519a7c277d --- /dev/null +++ b/core/unit_test/TestMDSpanConversion.hpp @@ -0,0 +1,504 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include +#include "experimental/__p0009_bits/layout_stride.hpp" + +namespace { + +template +struct TestViewMDSpanConversion { + using value_type = T; + + template + using layout_left_padded = Kokkos::Experimental::layout_left_padded; + + template + using layout_right_padded = + Kokkos::Experimental::layout_right_padded; + + struct TestAccessor { + using offset_policy = TestAccessor; + using element_type = value_type; + using reference = element_type &; + using data_handle_type = element_type *; + + constexpr TestAccessor() noexcept = default; + constexpr reference access(data_handle_type p, std::size_t i) noexcept { + return p[i]; + } + constexpr data_handle_type offset(data_handle_type p, + std::size_t i) noexcept { + return p + i; + } + }; + + template + static void test_conversion_from_mdspan( + Kokkos::View ref, + const MDSpanLayoutMapping &mapping) { + using unmanaged_view_type = + Kokkos::View>; + using 
natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename unmanaged_view_type::traits>::mdspan_type; + using mapping_type = MDSpanLayoutMapping; + using mdspan_layout_type = typename MDSpanLayoutMapping::layout_type; + using extents_type = typename mapping_type::extents_type; + using mdspan_type = + Kokkos::mdspan; + + static_assert(std::is_constructible_v); + static_assert(std::is_convertible_v == + std::is_convertible_v); + // Manually create an mdspan from ref so we have a valid pointer to play + // with + const auto &exts = mapping.extents(); + auto mds = mdspan_type{ref.data(), mapping}; + + auto test_view = unmanaged_view_type(mds); + + ASSERT_EQ(test_view.data(), ref.data()); + ASSERT_EQ(test_view.data(), mds.data_handle()); + ASSERT_EQ(test_view.layout(), ref.layout()); + for (std::size_t r = 0; r < mdspan_type::rank(); ++r) { + ASSERT_EQ(test_view.extent(r), ref.extent(r)); + ASSERT_EQ(test_view.extent(r), exts.extent(r)); + } + } + + template + static void test_conversion_to_mdspan( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v) { + using view_type = ViewType; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename view_type::traits>::mdspan_type; + + static_assert(natural_mdspan_type::rank() == view_type::rank); + static_assert(std::is_same_v); + constexpr bool is_strided_layout = + std::is_same_v; + if constexpr (!is_strided_layout) { + static_assert(natural_mdspan_type::mapping_type::padding_value == + Kokkos::dynamic_extent); + } + // test conversion operator to natural mdspan + { + natural_mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + + if constexpr (!is_strided_layout && natural_mdspan_type::rank() > 1) { + ASSERT_EQ(cvt.mapping().stride(1), ref_layout_mapping.stride(1)); + } + } + // test to_mdspan() returning natural mdspan + { + auto cvt = v.to_mdspan(); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(), 
v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + // test conversion operator to different mdspan type + { + using mdspan_type = Kokkos::mdspan< + const typename natural_mdspan_type::element_type, + Kokkos::dextents, + typename natural_mdspan_type::layout_type, + typename natural_mdspan_type::accessor_type>; + mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + } + + template + static void test_conversion_to_mdspan_with_accessor( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v, + const AccessorType &a) { + auto cvt = v.to_mdspan(a); + static_assert(decltype(cvt)::rank() == ViewType::rank); + static_assert(std::is_same_v); + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + + template + using natural_mdspan_type_for_view = typename Kokkos::Impl::MDSpanViewTraits< + typename ViewType::traits>::mdspan_type; + + static void run_test() { + // Verify we can only convert to compatible mdspans + static_assert(std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + static_assert( + std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Do not cast const away + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched dim + static_assert(!std::is_convertible_v< + Kokkos::View, + natural_mdspan_type_for_view>>); + + // Mismatched layouts + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + static_assert( + !std::is_convertible_v, + natural_mdspan_type_for_view>>); + // nvcc doesn't do CTAD properly here, making this way more verbose.. 
+ // LayoutLeft + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7, 3)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutRight + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename 
layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + test_conversion_from_mdspan( + Kokkos::View( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents>{ + Kokkos::dextents(3, 7)}); + test_conversion_from_mdspan( + Kokkos::View("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents>{ + Kokkos::extents()}); + + // LayoutStride + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, {}, strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::dextents{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + { + const size_t strides[] = {2, 4}; + 
test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan( + Kokkos::View( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping>{ + Kokkos::mdspan_non_standard, Kokkos::extents(), + strides}); + } + + // Conversion to mdspan + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4)); + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", + 4)); + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7)); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5})); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9})); + } + + // Aligned types (for padded layouts) + test_conversion_to_mdspan( + layout_left_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 127, 7)); + + test_conversion_to_mdspan( + layout_right_padded::mapping< + Kokkos::extents>({}, 128), + Kokkos::View( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 7, 127)); + + // Conversion with standard default_accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + 
Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + Kokkos::default_accessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + Kokkos::default_accessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + Kokkos::default_accessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + Kokkos::default_accessor{}); + } + + // Conversion with a test accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 4), + Kokkos::View("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded::mapping< + Kokkos::extents>({}, 7), + Kokkos::View("v", 4, + 7), + TestAccessor{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5}), + TestAccessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + 
TestAccessor{}); + } + } +}; + +TEST(TEST_CATEGORY, view_mdspan_conversion) { + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); + TestViewMDSpanConversion::run_test(); +} + +} // namespace diff --git a/core/unit_test/view/TestExtentsDatatypeConversion.cpp b/core/unit_test/view/TestExtentsDatatypeConversion.cpp index b95890614e0..1b9b2a36819 100644 --- a/core/unit_test/view/TestExtentsDatatypeConversion.cpp +++ b/core/unit_test/view/TestExtentsDatatypeConversion.cpp @@ -23,15 +23,14 @@ namespace { // Helper to make static tests more succinct template -constexpr bool datatype_matches_extent = - std::is_same_v::type, - Extent>; +constexpr bool datatype_matches_extent = std::is_same_v< + typename Kokkos::Impl::ExtentsFromDataType::type, + Extent>; template constexpr bool extent_matches_datatype = - std::is_same_v::type>; + std::is_same_v::type>; // Conversion from DataType to extents // 0-rank view diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 15ad577d149..05fce8ba44c 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -199,6 +199,12 @@ struct layout_stride { return __strides_storage_t{static_cast(s[Idxs])...}; } + template + MDSPAN_INLINE_FUNCTION + static constexpr const __strides_storage_t fill_strides(mdspan_non_standard_tag, const IntegralType (&s)[extents_type::rank()]) { + return __strides_storage_t{static_cast(s[Idxs])...}; + } + #ifdef __cpp_lib_span template MDSPAN_INLINE_FUNCTION @@ -309,6 +315,44 @@ struct layout_stride { */ } + MDSPAN_TEMPLATE_REQUIRES( + class IntegralTypes, + /* requires */ ( + // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type + // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping' + 
_MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t&, typename Extents::index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t&) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr + mapping( + mdspan_non_standard_tag, + extents_type const& e, + IntegralTypes (&s)[extents_type::rank()] + ) noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + e, __strides_storage_t(__impl::fill_strides(mdspan_non_standard, s)) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + { + /* + * TODO: check preconditions + * - s[i] > 0 is true for all i in the range [0, rank_ ). + * - REQUIRED-SPAN-SIZE(e, s) is a representable value of type index_type ([basic.fundamental]). + * - If rank_ is greater than 0, then there exists a permutation P of the integers in the + * range [0, rank_), such that s[ pi ] >= s[ pi - 1 ] * e.extent( pi - 1 ) is true for + * all i in the range [1, rank_ ), where pi is the ith element of P. 
+ */ + } + #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES( class IntegralTypes, diff --git a/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp b/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp index 3eeb39755c8..523bca4e11d 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp @@ -629,3 +629,8 @@ struct __bools; // end Pre-C++14 constexpr }}}1 //============================================================================== + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +constexpr struct mdspan_non_standard_tag { +} mdspan_non_standard; +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp index a8014867923..1f5ad70a6cf 100644 --- a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp +++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -158,19 +158,21 @@ class layout_left_padded::mapping { typename padded_stride_type::static_array_type padded_stride = {}; extents_type exts = {}; - constexpr index_type compute_offset(std::index_sequence<>) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { return 0; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffset index_offset) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, IndexOffset index_offset) const { return index_offset; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffsets... index_offsets) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, + IndexOffsets... 
index_offsets) const { index_type indices[] = {static_cast(index_offsets)...}; // self-recursive fold trick from // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144 @@ -241,62 +243,71 @@ class layout_left_padded::mapping { /** * Converting constructor from `layout_left::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; - * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + * This overload participates in overload resolution only if + * `is_constructible_v` is true. If + * `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, + * or `OtherExtents::static_extent(0)` must be `dynamic_extent`; otherwise, + * `OtherExtents::static_extent(0)` must be equal to the least multiple of + * `padding_value` greater than or equal to `extents_type::static_extent(0)` */ MDSPAN_TEMPLATE_REQUIRES( - class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + class _OtherExtents, + /* requires */ (std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) - || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + : 
padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (static_padding_stride != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (static_padding_stride == + _OtherExtents::static_extent(extent_to_pad_idx))); } /** * Converting constructor from `layout_stride::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true + * This overload participates in overload resolution only if + * `is_constructible_v` is true */ MDSPAN_TEMPLATE_REQUIRES( - class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) + class _OtherExtents, + /* requires */ (std::is_constructible_v)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - } + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) {} /** * Converting constructor from `layout_left_padded::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + * This overload participates in overload resolution only if + * `is_constructible_v` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) - constexpr - mapping(const _Mapping &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { static_assert(padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent || padding_value == _Mapping::padding_value); @@ -305,42 +316,43 @@ class layout_left_padded::mapping { /** * Converting constructor from `layout_right_padded::mapping`. * - * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && extents_type::rank() <= 1 - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) - constexpr - mapping(const _Mapping &other_mapping) noexcept - : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), - exts(other_mapping.extents()) - {} + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding( + other_mapping.extents(), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} - constexpr const extents_type &extents() const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { return exts; } - constexpr std::array - strides() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr std::array + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { return {}; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return {1}; } else { index_type value = 1; std::array s{}; s[extent_to_pad_idx] = value; value *= padded_stride.value(0); - for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) - { + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; + ++r) { s[r] = value; value *= exts.extent(r); } @@ -349,12 +361,11 @@ class layout_left_padded::mapping { } } - constexpr index_type - required_span_size() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr index_type + 
required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { return 1; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return exts.extent(0); } else { index_type value = padded_stride.value(0); @@ -375,40 +386,47 @@ class layout_left_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class... _Indices, - /* requires */ ( - sizeof...(_Indices) == extents_type::rank() && - (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) - ) - ) - constexpr size_t operator()(_Indices... idxs) const noexcept - { + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); } - static constexpr bool is_always_unique() noexcept { return true; } - static constexpr bool is_always_exhaustive() noexcept - { - return (extents_type::rank() <= rank_type(1)) - || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent - && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; } - static constexpr bool is_always_strided() noexcept { return true; } - static constexpr bool is_unique() noexcept { return true; } - constexpr bool is_exhaustive() const noexcept - { - return (extents_type::rank() < 2) - || (exts.extent(extent_to_pad_idx) == 
padded_stride.value(0)); + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; } - static constexpr bool is_strided() noexcept { return true; } - constexpr index_type stride(rank_type r) const noexcept - { + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { assert(r < extents_type::rank()); - if(r == 0) return index_type(1); + if (r == 0) + return index_type(1); index_type value = padded_stride.value(0); - for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + for (rank_type k = 1; k < r; k++) + value *= exts.extent(k); return value; } @@ -416,26 +434,26 @@ class layout_left_padded::mapping { /** * Equality operator between `layout_left_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. * - * \note There is currently a difference from p2642r2, where this function is specified as taking - * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_left_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept - { - // Workaround for some compilers not short-circuiting properly with compile-time checks - // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping bool strides_equal = true; - if constexpr (extents_type::rank() > rank_type(1)) - { - strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); } return (left.extents() == right.extents()) && strides_equal; } @@ -444,17 +462,15 @@ class layout_left_padded::mapping { /** * Inequality operator between `layout_left_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. 
*/ MDSPAN_TEMPLATE_REQUIRES( - class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept - { + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { return !(left == right); } #endif @@ -490,25 +506,27 @@ class layout_right_padded::mapping { typename padded_stride_type::static_array_type padded_stride = {}; extents_type exts = {}; - constexpr index_type compute_offset(std::index_sequence<>) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { return 0; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffset index_offset) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, IndexOffset index_offset) const { return index_offset; } template - constexpr index_type compute_offset(std::index_sequence, - IndexOffsets... index_offsets) const { + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { // self-recursive fold trick from // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 index_type res = 0; ((res = static_cast(index_offsets) + (Ranks == extent_to_pad_idx ? 
padded_stride.value(0) - : exts.extent(Ranks)) * + : exts.extent(Ranks)) * res), ...); return res; @@ -577,56 +595,62 @@ class layout_right_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + /* requires */ (std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { - static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) - || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (padded_stride_type::static_value() != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (padded_stride_type::static_value() == + _OtherExtents::static_extent(extent_to_pad_idx))); } /** * Converting constructor from `layout_stride::mapping`. 
* - * This overload participates in overload resolution only if `is_constructible_v` is true + * This overload participates in overload resolution only if + * `is_constructible_v` is true */ MDSPAN_TEMPLATE_REQUIRES( class _OtherExtents, - /* requires */ ( - std::is_constructible_v - ) - ) + /* requires */ (std::is_constructible_v)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - {} + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) {} /** * Converting constructor from `layout_right_padded::mapping`. * - * This overload participates in overload resolution only if `is_constructible_v` is true. - * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + * This overload participates in overload resolution only if + * `is_constructible_v` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && std::is_constructible_v - ) - ) + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION constexpr mapping(const _Mapping &other_mapping) - : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), - exts(other_mapping.extents()) - { + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant{})), + exts(other_mapping.extents()) { static_assert(padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent || padding_value == _Mapping::padding_value); @@ -635,41 +659,42 @@ class layout_right_padded::mapping { /** * Converting constructor from `layout_left_padded::mapping`. * - * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_left_padded_mapping<_Mapping>::value - && extents_type::rank() <= 1 - && std::is_constructible_v - ) - ) - MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v)) + MDSPAN_INLINE_FUNCTION constexpr mapping(const _Mapping &other_mapping) noexcept - : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), - exts(other_mapping.extents()) - {} + : padded_stride(padded_stride_type::init_padding( + other_mapping.extents(), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} - constexpr const extents_type &extents() const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { return exts; } - constexpr std::array - strides() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr std::array + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { return {}; - } else if constexpr ( extents_type::rank() == 1 ) { + } else if constexpr (extents_type::rank() == 1) { return {1}; } else { index_type value = 1; std::array s{}; s[extent_to_pad_idx] = value; value *= padded_stride.value(0); - for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) - { + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) { s[r] = value; value *= exts.extent(r); } @@ -678,17 +703,15 @@ class layout_right_padded::mapping { } } - constexpr index_type - required_span_size() const noexcept - { - if constexpr ( extents_type::rank() == 0 ) { + MDSPAN_INLINE_FUNCTION constexpr index_type + required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { return 1; - } else if constexpr ( extents_type::rank() == 
1 ) { + } else if constexpr (extents_type::rank() == 1) { return exts.extent(0); } else { index_type value = 1; - for (rank_type r = 0; r < extent_to_pad_idx; ++r) - { + for (rank_type r = 0; r < extent_to_pad_idx; ++r) { value *= exts.extent(r); } return value * padded_stride.value(0); @@ -705,40 +728,47 @@ class layout_right_padded::mapping { */ MDSPAN_TEMPLATE_REQUIRES( class... _Indices, - /* requires */ ( - sizeof...(_Indices) == extents_type::rank() && - (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) - ) - ) - constexpr size_t operator()(_Indices... idxs) const noexcept - { + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); } - static constexpr bool is_always_unique() noexcept { return true; } - static constexpr bool is_always_exhaustive() noexcept - { - return (extents_type::rank() <= rank_type(1)) - || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent - && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; } - static constexpr bool is_always_strided() noexcept { return true; } - static constexpr bool is_unique() noexcept { return true; } - constexpr bool is_exhaustive() const noexcept - { - return (extents_type::rank() < 2) - || (exts.extent(extent_to_pad_idx) == 
padded_stride.value(0)); + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; } - static constexpr bool is_strided() noexcept { return true; } - constexpr index_type stride(rank_type r) const noexcept - { + MDSPAN_INLINE_FUNCTION constexpr index_type + stride(rank_type r) const noexcept { assert(r < extents_type::rank()); - if(r == extents_type::rank() - 1) return index_type(1); + if (r == extents_type::rank() - 1) + return index_type(1); index_type value = padded_stride.value(0); - for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k); + for (rank_type k = extents_type::rank() - 2; k > r; k--) + value *= exts.extent(k); return value; } @@ -746,26 +776,26 @@ class layout_right_padded::mapping { /** * Equality operator between `layout_right_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. * - * \note There is currently a difference from p2642r2, where this function is specified as taking - * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_right_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept - { - // Workaround for some compilers not short-circuiting properly with compile-time checks - // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping bool strides_equal = true; - if constexpr (extents_type::rank() > rank_type(1)) - { - strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); } return (left.extents() == right.extents()) && strides_equal; } @@ -774,17 +804,15 @@ class layout_right_padded::mapping { /** * Inequality operator between `layout_right_padded`s * - * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. 
*/ MDSPAN_TEMPLATE_REQUIRES( class _Mapping, - /* requires */ ( - detail::is_layout_right_padded_mapping<_Mapping>::value - && (_Mapping::extents_type::rank() == extents_type::rank()) - ) - ) - friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept - { + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { return !(left == right); } #endif From 64fe756374d8ffe947b524905c8e20ea7ef7dbb0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 May 2024 17:47:49 -0400 Subject: [PATCH 421/432] SYCL: Don't use shuffles for top-level reductions (#7009) SYCL: Don't use shuffles for top-level reductions --- core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 06be143ecca..c838a1abc58 100644 --- a/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,9 +21,12 @@ namespace Kokkos::Impl::SYCLReduction { +// FIXME_SYCL For some types, shuffle reductions are competitive with local +// memory reductions but they are significantly slower for the value type used +// in combined reductions with multiple double arguments. 
template -inline constexpr bool use_shuffle_based_algorithm = - std::is_reference_v; +inline constexpr bool use_shuffle_based_algorithm = false; +// std::is_reference_v; template std::enable_if_t> workgroup_reduction( From 6aa2ad7da687095ded07167ca74f50d138625493 Mon Sep 17 00:00:00 2001 From: Alex Dutka <97711898+dutkalex@users.noreply.github.com> Date: Mon, 20 May 2024 23:52:21 +0200 Subject: [PATCH 422/432] Add a CITATION.cff file (#7008) * Create CITATION.cff * Update CITATION.cff * Update CITATION.cff Co-authored-by: Daniel Arndt * Update CITATION.cff Co-authored-by: Daniel Arndt * Add issue number [ci skip] --------- Co-authored-by: Daniel Arndt Co-authored-by: Damien L-G --- CITATION.cff | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000000..28c674c451b --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,65 @@ +cff-version: 1.2.0 +title: Kokkos +message: >- + If you use this software, please cite the overview paper +type: software +authors: + - name: The Kokkos authors + website: https://kokkos.org/community/team/ +identifiers: + - type: url + website: https://kokkos.org/kokkos-core-wiki/citation.html +repository-code: 'https://github.com/kokkos/kokkos' +url: 'https://kokkos.org/' +license: Apache-2.0 +preferred-citation: + type: article + authors: + - given-names: Christian R. + family-names: Trott + - given-names: Damien + family-names: Lebrun-Grandié + - given-names: Daniel + family-names: Arndt + - family-names: Ciesko + given-names: Jan + - given-names: Vinh + family-names: Dang + - family-names: Ellingwood + given-names: Nathan + - given-names: Rahulkumar + family-names: Gayatri + - given-names: Evan + family-names: Harvey + - given-names: Daisy S. 
+ family-names: Hollman + - given-names: Dan + family-names: Ibanez + - given-names: Nevin + family-names: Liber + - given-names: Jonathan + family-names: Madsen + - given-names: Jeff + family-names: Miles + - given-names: David + family-names: Poliakoff + - given-names: Amy + family-names: Powell + - given-names: Sivasankaran + family-names: Rajamanickam + - given-names: Mikael + family-names: Simberg + - given-names: Dan + family-names: Sunderland + - given-names: Bruno + family-names: Turcksin + - given-names: Jeremiah + family-names: Wilke + doi: 10.1109/TPDS.2021.3097283 + journal: IEEE Transactions on Parallel and Distributed Systems + start: 805 + end: 817 + title: "Kokkos 3: Programming Model Extensions for the Exascale Era" + volume: 33 + issue: 4 + year: 2022 From f8f0cc473a53ad559326b38f05e560233c642239 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 May 2024 17:53:29 -0400 Subject: [PATCH 423/432] Always run Graph tests (#7011) * Always run Graph tests * Workaround for HPX * Move comment --- core/unit_test/CMakeLists.txt | 25 +------------------ core/unit_test/TestGraph.hpp | 22 +++++++++------- .../category_files/TestHPX_Category.hpp | 1 + .../category_files/TestOpenACC_Category.hpp | 1 + .../TestOpenMPTarget_Category.hpp | 1 + .../category_files/TestSYCL_Category.hpp | 1 + .../category_files/TestThreads_Category.hpp | 1 + core/unit_test/cuda/TestCuda_Graph.cpp | 18 ------------- core/unit_test/hip/TestHIP_Graph.cpp | 18 ------------- core/unit_test/openmp/TestOpenMP_Graph.cpp | 18 ------------- core/unit_test/serial/TestSerial_Graph.cpp | 18 ------------- 11 files changed, 19 insertions(+), 105 deletions(-) delete mode 100644 core/unit_test/cuda/TestCuda_Graph.cpp delete mode 100644 core/unit_test/hip/TestHIP_Graph.cpp delete mode 100644 core/unit_test/openmp/TestOpenMP_Graph.cpp delete mode 100644 core/unit_test/serial/TestSerial_Graph.cpp diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 
5df8d1e2cf8..5f325ed4c12 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -151,6 +151,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ExecSpaceThreadSafety ExecutionSpace FunctorAnalysis + Graph HostSharedPtr HostSharedPtrAccessOnDevice Init @@ -658,12 +659,6 @@ if(Kokkos_ENABLE_SERIAL) UnitTestMainInit.cpp ${Serial_SOURCES2} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SerialGraph - SOURCES - UnitTestMainInit.cpp - serial/TestSerial_Graph.cpp - ) endif() if(Kokkos_ENABLE_THREADS) @@ -694,12 +689,6 @@ if (Kokkos_ENABLE_OPENMP) UnitTestMain.cpp openmp/TestOpenMP_InterOp.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMPGraph - SOURCES - UnitTestMainInit.cpp - openmp/TestOpenMP_Graph.cpp - ) endif() if(Kokkos_ENABLE_HPX) @@ -807,12 +796,6 @@ if(Kokkos_ENABLE_CUDA) UnitTestMainInit.cpp cuda/TestCuda_InterOp_StreamsMultiGPU.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaGraph - SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_Graph.cpp - ) endif() if(Kokkos_ENABLE_HIP) @@ -840,12 +823,6 @@ if(Kokkos_ENABLE_HIP) UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_HIPGraph - SOURCES - UnitTestMainInit.cpp - hip/TestHIP_Graph.cpp - ) endif() if(Kokkos_ENABLE_SYCL) diff --git a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index 9a36d08f445..45c86e50d39 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -66,7 +66,7 @@ struct SetResultToViewFunctor { } }; -struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { +struct TEST_CATEGORY_FIXTURE(graph) : public ::testing::Test { public: using count_functor = CountTestFunctor; using set_functor = SetViewToValueFunctor; @@ -88,7 +88,7 @@ struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { } }; -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one) { auto graph = 
Kokkos::Experimental::create_graph([&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); @@ -101,7 +101,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one_rvalue) { Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); }).submit(); @@ -112,7 +112,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); @@ -145,7 +145,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { ASSERT_EQ(0, bugs_host()); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), when_all_cycle) { view_type reduction_out{"reduction_out"}; view_host reduction_host{"reduction_host"}; Kokkos::Experimental::create_graph(ex, [&](auto root) { @@ -172,7 +172,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { // This test is disabled because we don't currently support copying to host, // even asynchronously. We _may_ want to do that eventually? 
-TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), DISABLED_repeat_chain) { auto graph = Kokkos::Experimental::create_graph( ex, [&, count_host = count_host](auto root) { //---------------------------------------- @@ -198,7 +198,7 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { //---------------------------------------- } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), zero_work_reduce) { auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_reduce(0, set_result_functor{bugs}, count); }); @@ -214,9 +214,13 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { // UVM works on pre pascal cards. #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL)) - Kokkos::fence(); + if constexpr (std::is_same_v) Kokkos::fence(); +#endif +#ifdef KOKKOS_ENABLE_HPX // FIXME_HPX graph.submit() isn't properly enqueued + if constexpr (std::is_same_v) + Kokkos::fence(); #endif - graph.submit(); // should reset to 0, but doesn't + graph.submit(); Kokkos::deep_copy(ex, count_host, count); ex.fence(); ASSERT_EQ(count_host(), 0); diff --git a/core/unit_test/category_files/TestHPX_Category.hpp b/core/unit_test/category_files/TestHPX_Category.hpp index d3a7cdbea53..c6a2aa9f201 100644 --- a/core/unit_test/category_files/TestHPX_Category.hpp +++ b/core/unit_test/category_files/TestHPX_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 3 #define TEST_CATEGORY_DEATH hpx_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::HPX +#define TEST_CATEGORY_FIXTURE(name) hpx_##name #endif diff --git a/core/unit_test/category_files/TestOpenACC_Category.hpp b/core/unit_test/category_files/TestOpenACC_Category.hpp index 0c4e4b7e119..6105eadf14f 100644 --- a/core/unit_test/category_files/TestOpenACC_Category.hpp +++ 
b/core/unit_test/category_files/TestOpenACC_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 8 #define TEST_CATEGORY_DEATH openacc_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenACC +#define TEST_CATEGORY_FIXTURE(name) openacc_##name #endif diff --git a/core/unit_test/category_files/TestOpenMPTarget_Category.hpp b/core/unit_test/category_files/TestOpenMPTarget_Category.hpp index 235b34ffab7..921cff78902 100644 --- a/core/unit_test/category_files/TestOpenMPTarget_Category.hpp +++ b/core/unit_test/category_files/TestOpenMPTarget_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 4 #define TEST_CATEGORY_DEATH openmptarget_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget +#define TEST_CATEGORY_FIXTURE(name) openmptarget_##name #endif diff --git a/core/unit_test/category_files/TestSYCL_Category.hpp b/core/unit_test/category_files/TestSYCL_Category.hpp index 8e1b18c9acd..59e72c72c77 100644 --- a/core/unit_test/category_files/TestSYCL_Category.hpp +++ b/core/unit_test/category_files/TestSYCL_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 7 #define TEST_CATEGORY_DEATH sycl_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::SYCL +#define TEST_CATEGORY_FIXTURE(name) sycl_##name #endif diff --git a/core/unit_test/category_files/TestThreads_Category.hpp b/core/unit_test/category_files/TestThreads_Category.hpp index 13b0b653f21..ae8ac608339 100644 --- a/core/unit_test/category_files/TestThreads_Category.hpp +++ b/core/unit_test/category_files/TestThreads_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 1 #define TEST_CATEGORY_DEATH threads_DeathTest #define TEST_EXECSPACE Kokkos::Threads +#define TEST_CATEGORY_FIXTURE(name) threads_##name #endif diff --git a/core/unit_test/cuda/TestCuda_Graph.cpp b/core/unit_test/cuda/TestCuda_Graph.cpp deleted file mode 100644 index 27203639690..00000000000 --- a/core/unit_test/cuda/TestCuda_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// 
************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/hip/TestHIP_Graph.cpp b/core/unit_test/hip/TestHIP_Graph.cpp deleted file mode 100644 index 405cb76c643..00000000000 --- a/core/unit_test/hip/TestHIP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/openmp/TestOpenMP_Graph.cpp b/core/unit_test/openmp/TestOpenMP_Graph.cpp deleted file mode 100644 index 22c8ab1bf8f..00000000000 --- a/core/unit_test/openmp/TestOpenMP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include diff --git a/core/unit_test/serial/TestSerial_Graph.cpp b/core/unit_test/serial/TestSerial_Graph.cpp deleted file mode 100644 index bff64d83e27..00000000000 --- a/core/unit_test/serial/TestSerial_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include From ce0915b5eeb0a78dae3e9fa743416f711ad798bb Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 May 2024 22:22:24 -0400 Subject: [PATCH 424/432] Fix undefined behavior in is_zero_byte (#7014) * Fix undefined behavior in is_zero_byte * Remove include file comments --- core/src/impl/Kokkos_ViewMapping.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index c37112be896..a0fe5c1a5d7 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include #include @@ -2538,9 +2540,10 @@ inline bool is_zero_byte(const T& t) { sizeof(T) % sizeof(int) == 0, int, std::conditional_t>>>; - const auto* const ptr = reinterpret_cast(&t); + auto bit_values = Kokkos::bit_cast< + Kokkos::Array>(t); for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (ptr[i] != 0) return false; + if (bit_values[i] != 0) return false; 
return true; } From fa8b501028b52b0141d4065a7197320bae207795 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 06:52:56 -0400 Subject: [PATCH 425/432] Disable OpenMPTarget Kokkos::Graph test (does not compile) --- core/unit_test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 5f325ed4c12..413d4ef1c58 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -383,6 +383,7 @@ endforeach() # Disable non-compiling tests based on clang version. if(Kokkos_ENABLE_OPENMPTARGET) list(REMOVE_ITEM OpenMPTarget_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Graph.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp From bfe9aa2f1630f155d3b62d9873dcb7b993be9e29 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 07:14:06 -0400 Subject: [PATCH 426/432] Fixup for disabling deprecation warnings with NVC++ Fixup for #6999 Deprecation warnings are still showing in the OpenACC CI build ``` "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", line 136: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 217: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ Remark: individual warnings can be suppressed with "--diag_suppress " "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", line 197: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 217: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", 
line 274: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = Kokkos::Array::strided>; ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 286: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/unit_test/TestArrayOps.hpp", line 338: warning: class "Kokkos::Array" was declared deprecated [deprecated_entity] using A = Kokkos::Array::strided>; ^ "/var/jenkins/workspace/Kokkos_PR-7017/core/src/Kokkos_Array.hpp", line 286: note: because of a "deprecated" attribute struct KOKKOS_DEPRECATED ^ ``` The generic EDG warning disable did not work so we handle NVC++ separately and use diagnostic pragmas. In case anyone wants to try something else https://godbolt.org/z/nxWbPMT95 --- core/src/Kokkos_Macros.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 27b32b15214..ceca2130e75 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -563,7 +563,12 @@ static constexpr bool kokkos_omp_on_host() { return false; } #endif // clang-format off -#if defined(__EDG__) +#if defined(__NVCOMPILER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("diag_suppress 1216") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("diag_default 1216") +#elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ _Pragma("warning disable 1478") From f3bd253d3d09d0dfd6ab76125e8b56b840db542f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 11:38:36 -0400 Subject: [PATCH 427/432] Remove unused CudaInternal::cuda_{malloc,free}_async_wrapper --- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 24f4af31019..25aa6502152 100644 --- 
a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -421,23 +421,6 @@ class CudaInternal { return cudaStreamSynchronize(stream); } - // The following are only available for cuda 11.2 and greater -#if (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - template - cudaError_t cuda_malloc_async_wrapper(void** devPtr, size_t size, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaMallocAsync(devPtr, size, get_input_stream(hStream)); - } - - template - cudaError_t cuda_free_async_wrapper(void* devPtr, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaFreeAsync(devPtr, get_input_stream(hStream)); - } -#endif - // C++ API routines template cudaError_t cuda_func_get_attributes_wrapper(cudaFuncAttributes* attr, From 083fb014cc3abff24cbd042b86fbbbf77817bb25 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 21 May 2024 22:09:26 -0400 Subject: [PATCH 428/432] Improve `Impl::is_zero_byte()` (#7017) * Improve Impl::is_zero_byte() by implementing it in terms of std::memcmp * Prefer function scope for the all zeroes buffer Co-authored-by: Daniel Arndt --------- Co-authored-by: Daniel Arndt --- core/src/impl/Kokkos_ViewMapping.hpp | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index a0fe5c1a5d7..c1f4c0290c1 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -17,11 +17,10 @@ #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP #define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#include #include #include -#include -#include #include #include #include @@ -2531,20 +2530,9 @@ namespace Kokkos { namespace Impl { template -inline bool is_zero_byte(const T& t) { - using comparison_type = std::conditional_t< - sizeof(T) % sizeof(long long int) == 0, long long int, - 
std::conditional_t< - sizeof(T) % sizeof(long int) == 0, long int, - std::conditional_t< - sizeof(T) % sizeof(int) == 0, int, - std::conditional_t>>>; - auto bit_values = Kokkos::bit_cast< - Kokkos::Array>(t); - for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (bit_values[i] != 0) return false; - return true; +bool is_zero_byte(const T& x) { + constexpr std::byte all_zeroes[sizeof(T)] = {}; + return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } //---------------------------------------------------------------------------- From 6f176cde00c35970153aed8ec64b57b0f7163b90 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 May 2024 07:50:46 -0400 Subject: [PATCH 429/432] OpenMPTarget: Fix compiling Graph tests (#7020) * OpenMPTarget: Fix compiling Graph tests * Use team_size 32 if compiling with OpenMPTarget support * Skip launch_six for OpenMPTarget * Reenable TestOpenMPTarget_Graph --- ...Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp | 10 ++++------ .../Kokkos_OpenMPTarget_ParallelReduce_Range.hpp | 9 ++++----- .../Kokkos_OpenMPTarget_ParallelReduce_Team.hpp | 10 ++++------ .../Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 15 +++++++++------ core/unit_test/CMakeLists.txt | 1 - core/unit_test/TestGraph.hpp | 4 ++++ 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index 0782a79302a..e86a1219749 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -55,12 +55,11 @@ class ParallelReduce m_scratch_memory_lock; - public: inline void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); execute_tile( m_functor_reducer.get_functor(), m_policy, m_result_ptr, std::integral_constant()); @@ -74,8 +73,7 @@ class ParallelReduce::accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + typename ViewType::memory_space>::accessible) {} template inline std::enable_if_t execute_tile( diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index caa568a8925..4a112ed11d0 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,13 +55,13 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - // Only let one ParallelReduce instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. 
@@ -108,8 +108,7 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + m_result_ptr_num_elems(arg_result_view.size()) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 8abffa47a43..16c0eedb818 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,12 +470,11 @@ class ParallelReduce m_scratch_memory_lock; - public: void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); const FunctorType& functor = m_functor_reducer.get_functor(); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -521,8 +520,7 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index c886c397966..b0d69328024 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -48,10 +48,6 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one ParallelScan instance at a time use the scratch memory. - // The constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock m_scratch_memory_lock; - template std::enable_if_t::value> call_with_tag( const FunctorType& f, const idx_type& idx, value_type& val, @@ -181,6 +177,10 @@ class ParallelScan, const idx_type chunk_size = 128; const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + // Only let one ParallelScan instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View @@ -201,8 +201,7 @@ class ParallelScan, : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} //---------------------------------------- }; @@ -230,6 +229,10 @@ class ParallelScanWithTotal, const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; if (N > 0) { + // Only let one ParallelScan instance at a time use the scratch memory. + std::scoped_lock scratch_memory_lock( + OpenMPTargetExec::m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 413d4ef1c58..5f325ed4c12 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -383,6 +383,7 @@ endforeach() # Disable non-compiling tests based on clang version. 
if(Kokkos_ENABLE_OPENMPTARGET) list(REMOVE_ITEM OpenMPTarget_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Graph.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp diff --git a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index 45c86e50d39..cefcda8e061 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -113,6 +113,10 @@ TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_one_rvalue) { } TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET team_size incompatible + if (std::is_same_v) + GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; +#endif auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); From cb27c99414fa604bd04d5efd8e684a3b4149d89c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 May 2024 08:04:18 -0400 Subject: [PATCH 430/432] SYCL: Skip launch_six Graph test --- core/unit_test/TestGraph.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/unit_test/TestGraph.hpp b/core/unit_test/TestGraph.hpp index cefcda8e061..735114d4c25 100644 --- a/core/unit_test/TestGraph.hpp +++ b/core/unit_test/TestGraph.hpp @@ -117,6 +117,11 @@ TEST_F(TEST_CATEGORY_FIXTURE(graph), launch_six) { if (std::is_same_v) GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; #endif +#if defined(KOKKOS_ENABLE_SYCL) // FIXME_SYCL + if (std::is_same_v) + GTEST_SKIP() << "skipping since test case is known to fail with SYCL"; +#endif + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = 
root.then_parallel_for(1, set_functor{bugs, 0}); From c8e0a95cbbe961f95befbda8d41d30bc6fff6a40 Mon Sep 17 00:00:00 2001 From: Bruno Turcksin Date: Wed, 22 May 2024 08:31:30 -0400 Subject: [PATCH 431/432] HIP: Use builtin atomic for compare_exchange (#7000) * Use builtin atomic for compare_exchange * Add generic implementation of atomic_exchange * Remove device_atomic_exchange function that takes a compare operator --- .../desul/atomics/Compare_Exchange_HIP.hpp | 145 ++++-------------- 1 file changed, 28 insertions(+), 117 deletions(-) diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp index 8c909bacdf4..0ade34f25df 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp @@ -9,6 +9,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ #define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ +#include #include #include #include @@ -17,130 +18,40 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { namespace Impl { -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicCAS(reinterpret_cast(dest), - reinterpret_cast(compare), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} +template +struct 
atomic_exchange_available_hip { + constexpr static bool value = + ((sizeof(T) == 1 && alignof(T) == 1) || (sizeof(T) == 4 && alignof(T) == 4) || + (sizeof(T) == 8 && alignof(T) == 8)) && + std::is_trivially_copyable::value; +}; -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - return return_val; -} - -template -__device__ std::enable_if_t +template +__device__ std::enable_if_t::value, T> device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; + T* const dest, T compare, T value, MemoryOrder, MemoryScope) { + (void)__hip_atomic_compare_exchange_strong( + dest, + &compare, + value, + HIPMemoryOrder::value, + HIPMemoryOrder>::value, + HIPMemoryScope::value); + return compare; } -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicExch(reinterpret_cast(dest), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} -template -__device__ std::enable_if_t device_atomic_exchange( - T* const 
dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicExch(reinterpret_cast(dest), - reinterpret_cast(value)); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T /*compare*/, T value, MemoryOrderAcquire, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderAcqRel, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t device_atomic_exchange( - T* const dest, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast(return_val); -} - -template -__device__ std::enable_if_t -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), 
MemoryScope()); - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); +template +__device__ std::enable_if_t::value, T> +device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope) { + T return_val = __hip_atomic_exchange(dest, + value, + HIPMemoryOrder::value, + HIPMemoryScope::value); return return_val; } template -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t::value, T> device_atomic_compare_exchange( T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front @@ -169,7 +80,7 @@ device_atomic_compare_exchange( } template -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t::value, T> device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front T return_val; From a5bb0d41bb2af2597533ad0aa8994bdaf770bef6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 21 Feb 2024 14:17:15 -0700 Subject: [PATCH 432/432] Fix Kokkos README's FENL link --- example/README | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/README b/example/README index 66860512448..2fe87276484 100644 --- a/example/README +++ b/example/README @@ -1,7 +1,7 @@ This directory contains example application proxies that use different parts of Kokkos. If you are looking for the FENL ("finite element -nonlinear" solve) example, it has moved into the LinAlg subpackage of -Tpetra. +nonlinear" solve) example, it has moved into the TrilinosCouplings +package in Trilinos. MANIFEST: