Fixed issues encountered through oneMKL portBLAS backend (codeplaysoftware#504)

* Minor fixes and reverted ACPP changes that caused unexpected behavior in tests and in the header-only library
* Fixed GEMM half support with the default CPU backend
* Added a check for managed USM allocation on AMD
* Added clarifications regarding half support

Co-authored-by: nscipione <[email protected]>
Co-authored-by: HJA Bird <[email protected]>
Co-authored-by: pgorlani <[email protected]>
4 people authored Apr 11, 2024
1 parent 5783414 commit 11e8b0b
Showing 15 changed files with 327 additions and 135 deletions.
6 changes: 0 additions & 6 deletions CMakeLists.txt
@@ -113,12 +113,6 @@ option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON)
option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF)
option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF)

if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU"))
OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET)))
set(BLAS_ENABLE_HALF OFF)
message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled")
endif()

if (SYCL_COMPILER MATCHES "adaptivecpp")
if(BLAS_ENABLE_COMPLEX)
message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex
4 changes: 2 additions & 2 deletions README.md
@@ -491,10 +491,10 @@ Some of the supported options are:
| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
| `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
| `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) |
| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |
| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float`. Enabling other types such as complex or half requires setting their respective options *(next)*. |
| `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) |
| `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and Gemm)* (`OFF` by default) |
| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |

## ComputeCpp Compilation *(Deprecated)*

9 changes: 9 additions & 0 deletions cmake/CmakeFunctionHelper.cmake
@@ -702,6 +702,15 @@ else() # default cpu backend
add_gemm_configuration(
"${data}" 64 "false" "false" "false"
64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false")

if(BLAS_ENABLE_HALF)
add_gemm_configuration(
"half" 128 "false" "false" "false"
64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "full" 1 "strided" "false" "false")
add_gemm_configuration(
"half" 64 "false" "false" "false"
64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false")
endif()
endforeach()

if(BLAS_ENABLE_COMPLEX)
12 changes: 7 additions & 5 deletions cmake/Modules/SYCL.cmake
@@ -26,11 +26,13 @@ include(CheckCXXCompilerFlag)
include(ConfigurePORTBLAS)

# find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp)
if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
message(STATUS "Using `omp` as ACPP_TARGETS")
set(ACPP_TARGETS "omp")
else()
message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
if(SYCL_COMPILER MATCHES "adaptivecpp")
if(NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
message(STATUS "Using `omp` as ACPP_TARGETS")
set(ACPP_TARGETS "omp")
else()
message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
endif()
endif()

check_cxx_compiler_flag("--acpp-targets" has_acpp)
40 changes: 8 additions & 32 deletions include/container/sycl_iterator.h
@@ -194,51 +194,27 @@ template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh,
size_t size) {
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
} else {
// Skip data initialization if not accessing in read mode only
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
cl::sycl::property::no_init{});
}
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
}

template <typename element_t>
template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh) {
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size());
} else {
// Skip data initialization if not accessing in read mode only
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size(),
cl::sycl::property::no_init{});
}
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size());
}

template <typename element_t>
template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>
BufferIterator<element_t>::get_range_accessor(size_t size) {
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));

} else {
// Skip data initialization if not accessing in read mode only
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
cl::sycl::property::no_init{});
}
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
}

template <typename element_t>
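For context, the accessor construction that get_range_accessor now performs unconditionally, and the no_init variant that the reverted branch used for non-read modes, are sketched below. This is an illustration only, not part of the commit: it assumes a SYCL 2020 compiler (sycl:: spelling rather than portBLAS's cl::sycl::), and the buffers, ranges and offsets are made up.

#include <sycl/sycl.hpp>

// Illustration only, not part of this commit: the two accessor constructor
// shapes get_range_accessor chooses between.
int main() {
  sycl::queue q;
  sycl::buffer<float, 1> a{sycl::range<1>{16}};
  sycl::buffer<float, 1> b{sycl::range<1>{16}};
  q.submit([&](sycl::handler& cgh) {
    // Plain ranged accessor with an offset: the only form the function
    // returns after this change, regardless of access mode.
    sycl::accessor<float, 1, sycl::access_mode::read_write> plain{
        a, cgh, sycl::range<1>{8}, sycl::id<1>{4}};
    // The reverted branch additionally attached the no_init property to
    // non-read accessors, allowing the runtime to skip the initial copy-in.
    sycl::accessor<float, 1, sycl::access_mode::write> skip_init{
        b, cgh, sycl::range<1>{8}, sycl::id<1>{4},
        sycl::property_list{sycl::property::no_init{}}};
    // No-op kernel: the accessors above only declare data requirements.
    cgh.single_task<class accessor_shapes_demo>([=] {
      (void)plain;
      (void)skip_init;
    });
  });
  q.wait();
  return 0;
}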
16 changes: 8 additions & 8 deletions include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
* \brief Prototype for the internal implementation of the ASUM operation. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _asum_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
@@ -257,9 +257,9 @@ typename sb_handle_t::event_t _nrm2(
* \brief Prototype for the internal implementation of the NRM2 operator. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _nrm2_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
@@ -269,8 +269,8 @@ typename sb_handle_t::event_t _nrm2_impl(
* \brief Prototype for the internal implementation of the Dot operator. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t,
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename container_2_t, typename index_t, typename increment_t>
typename sb_handle_t::event_t _dot_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
12 changes: 7 additions & 5 deletions include/operations/blas1_trees.h
@@ -208,7 +208,8 @@ struct AssignReduction {
* function below.
*
*/
template <typename operator_t, typename lhs_t, typename rhs_t>
template <typename operator_t, bool usmManagedMem, typename lhs_t,
typename rhs_t>
struct WGAtomicReduction {
using value_t = typename lhs_t::value_t;
using index_t = typename rhs_t::index_t;
@@ -304,10 +305,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
lhs_, rhs_, local_num_thread_, global_num_thread_);
}

template <typename operator_t, typename lhs_t, typename rhs_t>
inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
template <typename operator_t, bool usmManagedMem = false, typename lhs_t,
typename rhs_t>
inline WGAtomicReduction<operator_t, usmManagedMem, lhs_t, rhs_t>
make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, usmManagedMem, lhs_t, rhs_t>(lhs_, rhs_);
}

template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
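The shape of this signature change, a defaulted boolean non-type template parameter placed ahead of the deduced type parameters so existing call sites keep compiling while backends can opt in explicitly, is sketched in the self-contained toy below. Illustration only, not portBLAS code; every name in it is made up.

#include <iostream>
#include <utility>

// Illustration only: a defaulted bool flag followed by deduced type
// parameters, mirroring how make_wg_atomic_reduction now takes usmManagedMem
// before lhs_t/rhs_t.
template <bool flagged = false, typename lhs_t, typename rhs_t>
auto make_pair_like(lhs_t lhs, rhs_t rhs) {
  if constexpr (flagged) {
    return std::make_pair(rhs, lhs);  // the opted-in variant swaps the order
  } else {
    return std::make_pair(lhs, rhs);
  }
}

int main() {
  auto a = make_pair_like(1, 2.5);        // old-style call, flag defaults to false
  auto b = make_pair_like<true>(1, 2.5);  // explicit opt-in, as the AMD backend does
  std::cout << a.first << ' ' << b.first << '\n';
  return 0;
}

Because lhs_t and rhs_t are deduced from the function arguments, the defaulted flag can sit before them without forcing callers to spell them out, which is what lets existing make_wg_atomic_reduction<operator_t>(lhs, rhs) call sites compile unchanged.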
10 changes: 10 additions & 0 deletions include/portblas_helper.h
@@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value,
}
#endif

template <typename sb_handle_t, typename containerT>
inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) {
if constexpr (std::is_pointer_v<containerT>) {
return sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
} else {
return false;
}
}

} // end namespace helper
} // end namespace blas
#endif // PORTBLAS_HELPER_H
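The query this helper performs for raw USM pointers is sketched standalone below; illustration only, not the portBLAS API, assuming a SYCL 2020 compiler, with made-up function and variable names.

#include <iostream>
#include <sycl/sycl.hpp>

// Illustration only: the same classification is_malloc_shared performs for
// pointer containers, using sycl::get_pointer_type against the queue's context.
bool points_to_shared_usm(const void* p, const sycl::queue& q) {
  return sycl::get_pointer_type(p, q.get_context()) == sycl::usm::alloc::shared;
}

int main() {
  sycl::queue q;
  float* shared_p = sycl::malloc_shared<float>(4, q);
  float* device_p = sycl::malloc_device<float>(4, q);
  std::cout << std::boolalpha
            << points_to_shared_usm(shared_p, q) << ' '   // true
            << points_to_shared_usm(device_p, q) << '\n'; // false
  sycl::free(shared_p, q);
  sycl::free(device_p, q);
  return 0;
}

For buffer iterators the constexpr std::is_pointer_v branch makes the helper return false, since only raw USM pointers can refer to shared (managed) allocations.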
28 changes: 28 additions & 0 deletions samples/CMakeLists.txt
@@ -1,3 +1,31 @@
#/***************************************************************************
# *
# * @license
# * Copyright (C) Codeplay Software Limited
# * Licensed under the Apache License, Version 2.0 (the "License");
# * you may not use this file except in compliance with the License.
# * You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * For your convenience, a copy of the License has been included in this
# * repository.
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# *
# * portBLAS: BLAS implementation using SYCL
# *
# * @filename CMakeLists.txt
# *
# **************************************************************************/
cmake_minimum_required(VERSION 3.4.3)

project(portBLASSample LANGUAGES CXX)

set(PORTBLAS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../include)
set(PORTBLAS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})
133 changes: 106 additions & 27 deletions src/interface/blas1/backend/amd_gpu.hpp
@@ -25,6 +25,7 @@
#ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#include "interface/blas1_interface.h"
#include "portblas_helper.h"

namespace blas {
namespace asum {
Expand All @@ -34,16 +35,42 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _asum(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
/**
* This check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked or enforced. Since the reduction operator kernel
* implementation uses atomic operations, the reduction may silently fail
* without that particular hardware combination, so results living in managed
* (shared USM) memory are dispatched to a dedicated instantiation
* (usmManagedMem = true) below.
**/
#ifdef SB_ENABLE_USM
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
if (usm_managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend
@@ -101,16 +128,42 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _nrm2(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
/**
* This check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked or enforced. Since the reduction operator kernel
* implementation uses atomic operations, the reduction may silently fail
* without that particular hardware combination, so results living in managed
* (shared USM) memory are dispatched to a dedicated instantiation
* (usmManagedMem = true) below.
**/
#ifdef SB_ENABLE_USM
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
if (usm_managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_nrm2_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_nrm2_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_nrm2_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend
@@ -124,16 +177,42 @@ typename sb_handle_t::event_t _dot(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _vy, increment_t _incy, container_2_t _rs,
const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_dot_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
/**
* This check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked or enforced. Since the reduction operator kernel
* implementation uses atomic operations, the reduction may silently fail
* without that particular hardware combination, so results living in managed
* (shared USM) memory are dispatched to a dedicated instantiation
* (usmManagedMem = true) below.
**/
#ifdef SB_ENABLE_USM
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
if (usm_managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_dot_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_dot_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_dot_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_dot_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_dot_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend