diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index fcd755f471..16f1466dd3 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -49,7 +49,7 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTilePartitioner; + using TilePartitioner = ck_tile::GemmTile2DPartitioner; using GemmEpilogue = std::conditional_t< CShuffleEpilogue, @@ -61,8 +61,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& kOutputRank, 1, 0, - TilePartitioner::kM, - TilePartitioner::kN>>, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock>>, ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>>; diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 5c232cfb0e..bff243d559 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -56,7 +56,7 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& ck_tile::TileGemmShape, ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTilePartitioner; + using TilePartitioner = ck_tile::GemmTile2DPartitioner; using GemmEpilogue = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>; diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index b9c9eaa583..5cb2aa5045 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -51,7 +51,7 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTilePartitioner; + using TilePartitioner = ck_tile::GemmTile2DPartitioner; using GemmEpilogue = std::conditional_t< CShuffleEpilogue, @@ -63,8 +63,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre kOutputRank, 1, 0, - TilePartitioner::kM, - TilePartitioner::kN>>, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock>>, ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>>; diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index 14f3b4a5b8..6b51f696a3 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -15,7 +15,6 @@ #include "ck_tile/ops/gemm.hpp" #include "ck_tile/host.hpp" #include "grouped_gemm.hpp" -#include "utils.hpp" namespace { @@ -102,7 +101,7 @@ using Kernel = ck_tile::GroupedGemmKernel>; }; // namespace -std::size_t GetWorkspaceSize(const std::vector& gemm_descs) +std::size_t get_workspace_size(const std::vector& gemm_descs) { return ::Kernel::GetWorkSpaceSize(gemm_descs); } diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp index 20ba740884..3e5210b96c 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -52,8 +52,8 @@ auto create_args(int argc, char* argv[]) return std::make_tuple(result, arg_parser); } -std::size_t GetWorkspaceSize(const std::vector& gemm_descs); +std::size_t get_workspace_size(const std::vector& gemm_descs); -float grouped_gemm_calc(const std::vector& gemm_descs, - const ck_tile::stream_config& s, - void* p_workspace_); +float grouped_gemm(const std::vector& gemm_descs, + const ck_tile::stream_config& s, + void* p_workspace_); diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc index e889a85bf4..bd7ce38007 100644 --- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -31,7 +31,7 @@ float invoke_gemm(int n_warmup, { ck_tile::DeviceMem gemm_workspace; - gemm_workspace.Realloc(GetWorkspaceSize(args)); + gemm_workspace.Realloc(get_workspace_size(args)); float ave_time = grouped_gemm( args, @@ -128,16 +128,16 @@ int run_grouped_gemm_example_with_layouts(int argc, const ck_tile::index_t N = Ns[i]; const ck_tile::index_t K = Ks[i]; - stride_As[i] = f_get_default_stride(M, N, stride_As[i], a_layout); - stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], b_layout); - stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{}); + stride_As[i] = ck_tile::get_default_stride(M, N, stride_As[i], a_layout); + stride_Bs[i] = ck_tile::get_default_stride(K, N, stride_Bs[i], b_layout); + stride_Cs[i] = ck_tile::get_default_stride(M, N, stride_Cs[i], CLayout{}); - a_m_k_tensors.push_back( - ck_tile::HostTensor(f_host_tensor_descriptor(M, K, stride_As[i], a_layout))); - b_k_n_tensors.push_back( - ck_tile::HostTensor(f_host_tensor_descriptor(K, N, stride_Bs[i], b_layout))); + a_m_k_tensors.push_back(ck_tile::HostTensor( + ck_tile::host_tensor_descriptor(M, K, stride_As[i], a_layout))); + b_k_n_tensors.push_back(ck_tile::HostTensor( + ck_tile::host_tensor_descriptor(K, N, stride_Bs[i], b_layout))); c_m_n_tensors.push_back(ck_tile::HostTensor( - f_host_tensor_descriptor(M, N, stride_Cs[i], CLayout{}))); + ck_tile::host_tensor_descriptor(M, N, stride_Cs[i], CLayout{}))); std::cout << "gemm[" << i << "]" << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc @@ -178,7 +178,7 @@ int run_grouped_gemm_example_with_layouts(int argc, for(int i = 0; i < group_count; ++i) { ck_tile::HostTensor c_m_n_host_ref( - f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); + ck_tile::host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); c_m_n_host_ref.SetZero(); ck_tile::reference_gemm( a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); diff --git a/example/ck_tile/17_grouped_gemm/utils.hpp b/example/ck_tile/17_grouped_gemm/utils.hpp deleted file mode 100644 index bb3cdf9fdc..0000000000 --- a/example/ck_tile/17_grouped_gemm/utils.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -template -constexpr auto -f_host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) -{ - using namespace ck_tile::literals; - - if constexpr(std::is_same_v) - { - return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz}); - } - else - { - return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride}); - } -} -template -constexpr auto -f_get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) -{ - if(stride == 0) - { - if constexpr(std::is_same_v) - { - return col; - } - else - { - return row; - } - } - else - return stride; -} diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 02ce449912..34f8ec5245 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -54,7 +54,6 @@ #include "ck_tile/core/tensor/tile_window_linear.hpp" #include "ck_tile/core/tensor/tile_window_utils.hpp" #include "ck_tile/core/tensor/update_tile.hpp" -#include "ck_tile/core/utility/amd_address_space.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/functional_with_tuple.hpp" diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index afcf982a63..09de5f325f 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -12,18 +12,37 @@ namespace ck_tile { -enum struct address_space_enum +template +struct safe_underlying_type; + +template +struct safe_underlying_type +{ + using type = std::underlying_type_t; +}; + +template +struct safe_underlying_type +{ + using type = void; +}; + +template +using safe_underlying_type_t = typename safe_underlying_type::value>::type; + +enum struct address_space_enum : std::uint16_t { - generic, + generic = 0, global, lds, sgpr, - vgpr, + constant, + vgpr }; -enum struct memory_operation_enum +enum struct memory_operation_enum : std::uint16_t { - set, + set = 0, atomic_add, atomic_max, add @@ -109,4 +128,30 @@ CK_TILE_DEVICE void s_nop(index_t cnt = 0) #endif } +#define CK_CONSTANT_ADDRESS_SPACE \ + __attribute__((address_space( \ + static_cast>(address_space_enum::constant)))) + +template +__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) +{ + // cast a pointer in "Constant" address space (4) to "Generic" address space (0) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T*)(p); // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +template +__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) +{ + // cast a pointer in "Generic" address space (0) to "Constant" address space (4) + // only c-style pointer cast seems be able to be compiled; +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + } // namespace ck_tile diff --git a/include/ck_tile/core/utility/amd_address_space.hpp b/include/ck_tile/core/utility/amd_address_space.hpp deleted file mode 100644 index cb242bf0d5..0000000000 --- a/include/ck_tile/core/utility/amd_address_space.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core/config.hpp" - -// Address Space for AMDGCN -// https://llvm.org/docs/AMDGPUUsage.html#address-space - -namespace ck_tile { - -#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) - -template -__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) -{ - // cast a pointer in "Constant" address space (4) to "Generic" address space (0) - // only c-style pointer cast seems be able to be compiled -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - return (T*)p; // NOLINT(old-style-cast) -#pragma clang diagnostic pop -} - -template -__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) -{ - // cast a pointer in "Generic" address space (0) to "Constant" address space (4) - // only c-style pointer cast seems be able to be compiled -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) -#pragma clang diagnostic pop -} - -} // namespace ck_tile diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index 3902cad178..2babb2afe9 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -678,4 +678,37 @@ struct HostTensor Descriptor mDesc; Data mData; }; + +template +auto host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) +{ + using namespace ck_tile::literals; + + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } +} +template +auto get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout) +{ + if(stride == 0) + { + if constexpr(std::is_same_v) + { + return col; + } + else + { + return row; + } + } + else + return stride; +} + } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index eaf66237af..4b4a4d7a09 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -101,9 +101,12 @@ struct BatchedGemmKernel : public GemmKernel) { - if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) + if(kargs.K % TilePartitioner::KPerBlock != 0 && GemmPipeline::kPadK == false) { return false; } @@ -185,7 +185,7 @@ struct GemmKernel } else { - if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false) + if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false) { return false; } @@ -197,7 +197,7 @@ struct GemmKernel if constexpr(std::is_same_v) { - if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false) + if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false) { return false; } @@ -208,7 +208,7 @@ struct GemmKernel } else { - if(kargs.K % TilePartitioner::kK != 0 && GemmPipeline::kPadK == false) + if(kargs.K % TilePartitioner::KPerBlock != 0 && GemmPipeline::kPadK == false) { return false; } @@ -220,7 +220,7 @@ struct GemmKernel if constexpr(std::is_same_v) { - if(kargs.N % TilePartitioner::kN != 0 && GemmPipeline::kPadN == false) + if(kargs.N % TilePartitioner::NPerBlock != 0 && GemmPipeline::kPadN == false) { return false; } @@ -231,7 +231,7 @@ struct GemmKernel } else { - if(kargs.M % TilePartitioner::kM != 0 && GemmPipeline::kPadM == false) + if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false) { return false; } @@ -323,17 +323,17 @@ struct GemmKernel const auto& a_tensor_view = views.at(I0); if constexpr(std::is_same_v) { - return pad_tensor_view( - a_tensor_view, - make_tuple(number{}, number{}), - sequence{}); + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); } else { - return pad_tensor_view( - a_tensor_view, - make_tuple(number{}, number{}), - sequence{}); + return pad_tensor_view(a_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); } }(); @@ -341,17 +341,17 @@ struct GemmKernel const auto& b_tensor_view = views.at(I1); if constexpr(std::is_same_v) { - return pad_tensor_view( - b_tensor_view, - make_tuple(number{}, number{}), - sequence{}); + return pad_tensor_view(b_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); } else { - return pad_tensor_view( - b_tensor_view, - make_tuple(number{}, number{}), - sequence{}); + return pad_tensor_view(b_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); } }(); @@ -359,17 +359,17 @@ struct GemmKernel const auto& c_tensor_view = views.at(I2); if constexpr(std::is_same_v) { - return pad_tensor_view( - c_tensor_view, - make_tuple(number{}, number{}), - sequence{}); + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); } else { - return pad_tensor_view( - c_tensor_view, - make_tuple(number{}, number{}), - sequence{}); + return pad_tensor_view(c_tensor_view, + make_tuple(number{}, + number{}), + sequence{}); } }(); @@ -383,19 +383,19 @@ struct GemmKernel const auto& a_pad_view = views.at(I0); const auto& a_block_window = make_tile_window( a_pad_view, - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), {i_m, 0}); const auto& b_pad_view = views.at(I1); const auto& b_block_window = make_tile_window( b_pad_view, - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), {i_n, 0}); const auto& c_pad_view = views.at(I2); auto c_block_window = make_tile_window( c_pad_view, - make_tuple(number{}, number{}), + make_tuple(number{}, number{}), {i_m, i_n}); return make_tuple(a_block_window, b_block_window, c_block_window); @@ -426,7 +426,7 @@ struct GemmKernel // Create Gemm tensor views, pad views and tile windows const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset); - ; + const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); @@ -456,7 +456,10 @@ struct GemmKernel CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const { - const auto [i_m, i_n] = TilePartitioner{}(); + const auto [iM, iN] = TilePartitioner::GetOutputTileIndex(blockIdx.x, blockIdx.y); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const SplitKBatchOffset splitk_batch_offset(kargs); // options const ADataType* a_ptr = diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index 8ffe681f90..eb2b817db6 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -1,73 +1,160 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "ck_tile/core.hpp" namespace ck_tile { -template -struct GemmTilePartitioner + +/** @brief Struct representing 2D block index mapping into 3D output tile space. */ +template +struct GemmTile2DPartitioner { - using BlockGemmShape = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; - static constexpr index_t kM = BlockGemmShape::kM; - static constexpr index_t kN = BlockGemmShape::kN; - static constexpr index_t kK = BlockGemmShape::kK; + static constexpr index_t MPerBlock = BlockGemmShape::kM; + static constexpr index_t NPerBlock = BlockGemmShape::kN; + static constexpr index_t KPerBlock = BlockGemmShape::kK; - CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t batch_size) + /** @brief Returns 3D grid size. */ + CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t batch_size) noexcept( + noexcept(MPerBlock != 0 && NPerBlock != 0)) -> dim3 { - index_t GridDimX = (M + kM - 1) / kM; - index_t GridDimY = (N + kN - 1) / kN; - index_t GridDimZ = batch_size; + const index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; + const index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; + const index_t GridDimZ = batch_size; return dim3(GridDimX, GridDimY, GridDimZ); } - CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) + /** + * @brief Returns the number of loops. + * @param [in] K is dimension + */ + CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) noexcept -> index_t { - return integer_divide_ceil(K, kK); + return integer_divide_ceil(K, KPerBlock); } - CK_TILE_DEVICE auto operator()() + /** + * @brief The function returns 2D output tile space. + * @param [in] blockIdx is blockIdx.x + * @param [in] blockIdy is blockIdx.y + * @return Returns the output tile indexes. + */ + CK_TILE_DEVICE static constexpr auto GetOutputTileIndex(index_t blockIdx, + index_t blockIdy) noexcept + -> const tuple { - const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM); - const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN); + const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx); + const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy); return make_tuple(iM, iN); } }; -template +/** + * @brief Struct representing 1D block index mapping into 2D output tile space. + */ +template struct GemmTile1DPartitioner { - using BlockGemmShape = remove_cvref_t; + using BlockGemmShape = remove_cvref_t; static constexpr index_t MPerBlock = BlockGemmShape::kM; static constexpr index_t NPerBlock = BlockGemmShape::kN; static constexpr index_t KPerBlock = BlockGemmShape::kK; - CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N) + /** @brief delete default ctr with no any object */ + constexpr GemmTile1DPartitioner() noexcept = delete; + + /** @brief constructs an object that does contain a N value. */ + constexpr GemmTile1DPartitioner(index_t N) noexcept { N_ = N; } + + /** @brief Returns 1D grid size. */ + CK_TILE_HOST static constexpr auto + GridSize(index_t M, index_t N) noexcept(noexcept(MPerBlock != 0 && NPerBlock != 0)) -> dim3 { - index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; - index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; + const index_t GridDimX = (M + MPerBlock - 1) / MPerBlock; + const index_t GridDimY = (N + NPerBlock - 1) / NPerBlock; return dim3(GridDimX * GridDimY, 1, 1); } - CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N) + /** + * @brief Returns the number of blocks in N. + * @param [in] N is dimension + */ + CK_TILE_HOST_DEVICE static constexpr auto GetNBlock(index_t N) noexcept -> index_t { return integer_divide_ceil(N, NPerBlock); } - CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) + /** + * @brief Returns the number of loops. + * @param [in] K is dimension + */ + CK_TILE_HOST_DEVICE static constexpr auto GetLoopNum(index_t K) noexcept -> index_t { return integer_divide_ceil(K, KPerBlock); } - CK_TILE_DEVICE auto operator()(index_t blockOffset, index_t NBlockSize) + /** + * @brief The function returns 2D output tile space. + * @param [in] blockIdx is blockIdx.x - block_start. + * */ + CK_TILE_DEVICE static constexpr auto GetOutputTileIndex(index_t blockIdx) noexcept + -> const tuple + { + const index_t NBlock = GetNBlock(N_); + + const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlock); + const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - (iM)*NBlock); + return make_tuple(iM, iN); + } + + private: + CK_TILE_DEVICE static index_t N_; +}; + +/** + * @brief `GemmTile1DPartitioner::GetOutputTileIndex`'s std::false specialization, + * checking expression validity in-place for ill-formed. + */ +template +struct HasFnOneArgImpl : std::false_type +{ +}; + +/** + * @brief `GemmTile1DPartitioner::GetOutputTileIndex`'s std::true specialization, + * checking expression validity in-place for well-formed. + * @note: `1` - a constant value indicating the number of parameters in the function. + */ +template +struct HasFnOneArgImpl().GetOutputTileIndex(1))>> + : std::true_type +{ +}; + +/** + * @brief Struct used to calculate offseted tile indexes. + * @note: The struct supports the 1D-Partitioner mechanism, + * enable-if `GetOutputTileIndex`-fn is std::true_type when `GetOutputTileIndex`-fn is well-formed, + * otherwise std::false_type. + */ +template {}>> +struct OffsettedTile1DPartitioner +{ + /** + * @brief The function subtracts the block's start (offset) from 1D raw-indexes. + * @param [in] block_start is `blockIdx.x - block_start`. + * @return Returns a `tuple` [Im, In] shifted index, used to shift 1d-tile index. + */ + [[nodiscard]] CK_TILE_DEVICE static constexpr auto GetOffsetedTileIndex(index_t block_start, + index_t N) noexcept + -> const tuple { - index_t iM = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) / - GetNBlock(NBlockSize) * MPerBlock); - index_t iN = __builtin_amdgcn_readfirstlane((blockIdx.x - blockOffset) % - GetNBlock(NBlockSize) * NPerBlock); + const auto [iM, iN] = PartitionerFn(N).GetOutputTileIndex(blockIdx.x - block_start); return make_tuple(iM, iN); } }; diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index f24fc47afc..6dbb1d6b82 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -1,72 +1,79 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include -#include - #include "ck_tile/core/numeric/math.hpp" #include "ck_tile/core/utility/literals.hpp" -#include "ck_tile/core/utility/amd_address_space.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp" #include "ck_tile/host.hpp" namespace ck_tile { -struct GroupedGemmHostArgs +struct GroupedGemmHostArgs : public ck_tile::GemmHostArgs { - const void* a_ptr; - const void* b_ptr; - void* c_ptr; - index_t M; - index_t N; - index_t K; - index_t stride_A; - index_t stride_B; - index_t stride_C; + CK_TILE_HOST GroupedGemmHostArgs() noexcept = default; + CK_TILE_HOST GroupedGemmHostArgs(const void* a_ptr_, + const void* b_ptr_, + void* c_ptr_, + ck_tile::index_t M_, + ck_tile::index_t N_, + ck_tile::index_t K_, + ck_tile::index_t stride_A_, + ck_tile::index_t stride_B_, + ck_tile::index_t stride_C_) + : GemmHostArgs(a_ptr_, b_ptr_, c_ptr_, KBatch, M_, N_, K_, stride_A_, stride_B_, stride_C_) + { + } + + private: + static constexpr index_t KBatch = 1; }; template -struct GroupedGemmKernel +struct GroupedGemmKernel : public GemmKernel { - using TilePartitioner = remove_cvref_t; - using GemmPipeline = remove_cvref_t; - using EpiloguePipeline = remove_cvref_t; - using ALayout = remove_cvref_t; - using BLayout = remove_cvref_t; - using CLayout = remove_cvref_t; - static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + using TilePartitioner = remove_cvref_t; + using GemmPipeline = remove_cvref_t; + using EpiloguePipeline = remove_cvref_t; + using ALayout = remove_cvref_t; + using BLayout = remove_cvref_t; + using CLayout = remove_cvref_t; using ADataType = remove_cvref_t; using BDataType = remove_cvref_t; using CDataType = remove_cvref_t; + using OffsetTile1DPartitioner = OffsettedTile1DPartitioner; + using Base = GemmKernel; + using GemmKernelArgs = typename Base::GemmKernelArgs; + + static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize; + static constexpr index_t KBatch = 1; + struct GemmTransKernelArg { - GroupedGemmHostArgs group_karg; + GemmKernelArgs group_karg; ck_tile::index_t block_start; ck_tile::index_t block_end; GemmTransKernelArg() = default; - GemmTransKernelArg(GroupedGemmHostArgs&& karg, index_t bl_start, index_t bl_end) + GemmTransKernelArg(GemmKernelArgs&& karg, index_t bl_start, index_t bl_end) : group_karg{karg}, block_start{bl_start}, block_end{bl_end} { } }; - __host__ static size_t GetWorkSpaceSize(const std::vector& gemm_descs) + __host__ static auto GetWorkSpaceSize(const std::vector& gemm_descs) + -> std::size_t { return gemm_descs.size() * sizeof(GemmTransKernelArg); } - __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); } - - using Hargs = GroupedGemmHostArgs; + __host__ static constexpr auto BlockSize() -> dim3 { return dim3(KernelBlockSize); } - __host__ static constexpr auto GridSize(const std::vector& gemm_descs) + __host__ static constexpr auto GridSize(const std::vector& gemm_descs) { index_t grid_size = 0; for(const auto& it_desc : gemm_descs) @@ -77,7 +84,8 @@ struct GroupedGemmKernel return dim3(grid_size, 1, 1); } - CK_TILE_HOST static auto MakeKargs(const std::vector& gemm_descs) + CK_TILE_HOST static auto MakeKargs(const std::vector& gemm_descs) + -> std::vector { std::vector gemm_kernel_args_; index_t group_count = ck_tile::type_convert(gemm_descs.size()); @@ -100,22 +108,23 @@ struct GroupedGemmKernel const index_t stride_c = gemm_descs[i].stride_C; const auto dim3 = TilePartitioner::GridSize(M, N); - const index_t grid_size_grp = dim3.x * 1 * 1; + const index_t grid_size_grp = dim3.x; const index_t block_start = grid_size; const index_t block_end = grid_size + grid_size_grp; grid_size += grid_size_grp; - auto karg = GroupedGemmHostArgs{type_convert(gemm_descs[i].a_ptr), - type_convert(gemm_descs[i].b_ptr), - type_convert(gemm_descs[i].c_ptr), - M, - N, - K, - stride_a, - stride_b, - stride_c}; + auto karg = GemmKernelArgs{type_convert(gemm_descs[i].a_ptr), + type_convert(gemm_descs[i].b_ptr), + type_convert(gemm_descs[i].c_ptr), + M, + N, + K, + stride_a, + stride_b, + stride_c, + KBatch}; gemm_kernel_args_.emplace_back(std::move(karg), block_start, block_end); } @@ -123,162 +132,34 @@ struct GroupedGemmKernel return gemm_kernel_args_; } - CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() + CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize() -> index_t { return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize()); } - CK_TILE_DEVICE void Run(const Hargs& kargs, const index_t block_start) const + CK_TILE_DEVICE void Run(const GemmTransKernelArg& kargs) const { - const auto [i_m, i_n] = TilePartitioner{}(block_start, kargs.N); - // options - const ADataType* a_start = static_cast(kargs.a_ptr); - const BDataType* b_start = static_cast(kargs.b_ptr); - // Convert pointers to tensor views - auto a_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - a_start, - make_tuple(kargs.M, kargs.K), - make_tuple(kargs.stride_A, 1), - number{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - a_start, - make_tuple(kargs.M, kargs.K), - make_tuple(1, kargs.stride_A), - number<1>{}, - number<1>{}); - } - }(); + const auto [iM, iN] = + OffsetTile1DPartitioner::GetOffsetedTileIndex(kargs.block_start, kargs.group_karg.N); - auto b_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - b_start, - make_tuple(kargs.N, kargs.K), - make_tuple(1, kargs.stride_B), - number<1>{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - b_start, - make_tuple(kargs.N, kargs.K), - make_tuple(kargs.stride_B, 1), - number{}, - number<1>{}); - } - }(); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); - auto a_pad_view = [&]() { - if constexpr(std::is_same_v) - { - return pad_tensor_view(a_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); - } - else - { - return pad_tensor_view(a_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); - } - }(); - // clang-format on + const typename Base::SplitKBatchOffset splitk_batch_offset(kargs.group_karg, blockIdx.z); - auto a_block_window = make_tile_window( - a_pad_view, - make_tuple(number{}, number{}), - {i_m, 0}); - - auto b_pad_view = [&]() { - if constexpr(std::is_same_v) - { - return pad_tensor_view(b_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); - } - else - { - return pad_tensor_view(b_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); - } - }(); - - auto b_block_window = make_tile_window( - b_pad_view, - make_tuple(number{}, number{}), - {i_n, 0}); + const ADataType* a_ptr = static_cast(kargs.group_karg.a_ptr); + const BDataType* b_ptr = static_cast(kargs.group_karg.b_ptr); + CDataType* c_ptr = static_cast(kargs.group_karg.c_ptr); // allocate LDS __shared__ char smem_ptr[GetSmemSize()]; - const index_t num_loop = TilePartitioner::GetLoopNum(kargs.K); - - // Run GEMM cooperatively by whole wokrgroup. - auto c_block_tile = - GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr); - - CDataType* c_start = static_cast(kargs.c_ptr); - auto c_tensor_view = [&]() { - if constexpr(std::is_same_v) - { - return make_naive_tensor_view( - c_start, - make_tuple(kargs.M, kargs.N), - make_tuple(kargs.stride_C, 1), - number{}, - number<1>{}); - } - else - { - return make_naive_tensor_view( - c_start, - make_tuple(kargs.M, kargs.N), - make_tuple(1, kargs.stride_C), - number<1>{}, - number<1>{}); - } - }(); - - auto c_pad_view = [&]() { - if constexpr(std::is_same_v) - { - return pad_tensor_view(c_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); - } - else - { - return pad_tensor_view(c_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); - } - }(); - auto CBlockWindow_pad = make_tile_window( - c_pad_view, - make_tuple(number{}, number{}), - {i_m, i_n}); - - EpiloguePipeline{}(CBlockWindow_pad, c_block_tile); + this->RunGemm( + a_ptr, b_ptr, c_ptr, smem_ptr, kargs.group_karg, splitk_batch_offset, i_m, i_n); } CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, - int group_count) const + index_t group_count) const { const index_t block_id = ck_tile::get_block_1d_id(); const auto gemm_desc_ptr = reinterpret_cast( @@ -286,7 +167,7 @@ struct GroupedGemmKernel index_t left = 0; index_t right = group_count; - index_t group_id = index_t((left + right) / 2); + index_t group_id = index_t((left + right) >> 1); while((!(block_id >= gemm_desc_ptr[group_id].block_start && block_id < gemm_desc_ptr[group_id].block_end)) && @@ -300,10 +181,10 @@ struct GroupedGemmKernel { left = group_id; } - group_id = index_t((left + right) / 2); + group_id = index_t((left + right) >> 1); } - Run(gemm_desc_ptr[group_id].group_karg, gemm_desc_ptr[group_id].block_start); + Run(gemm_desc_ptr[group_id]); } }; diff --git a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp index e7e9b3d679..ab534ffcfa 100644 --- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp +++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include @@ -61,7 +61,7 @@ class TestCkTileBatchedGemm : public ::testing::Test ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTilePartitioner; + using TilePartitioner = ck_tile::GemmTile2DPartitioner; using GemmEpilogue = std::conditional_t< CShuffleEpilogue, @@ -73,8 +73,8 @@ class TestCkTileBatchedGemm : public ::testing::Test kOutputRank, 1, 0, - TilePartitioner::kM, - TilePartitioner::kN>>, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock>>, ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>>; diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index 4b0e40060d..96199f33e8 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -59,7 +59,7 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::TileGemmShape, ck_tile::sequence, ck_tile::sequence>; - using TilePartitioner = ck_tile::GemmTilePartitioner; + using TilePartitioner = ck_tile::GemmTile2DPartitioner; using GemmEpilogue = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>;