Skip to content

Commit

Permalink
Transpose profiler fix (#1114)
Browse files Browse the repository at this point in the history
* added working example for 5D input using 1D kernel

* example with 5D input tensor and 2d kernel - not working: issues with arguments

* added updated version of 3d device op - changed descriptors/dims

* added example file to check kernel

* fixed descriptor and isSupportedArgument stride problem

* added and modified kernel for 3d - updated tids/loop

* adding some more 5d example files

* fixed some issues

* changes made for testing

* working version: fixed error in stride for A, still a bit inefficient

* cleaned up formatting/comments

* updating formatting

* more formatting fixes

* fixing cmake, adding back gpu targets in cmake script

* adding client example

* added instances for client example

* fixed errors in client example

* implemented client ex with device_elementwise.hpp and device_elementwise_3d_impl.hpp

* removed extra files

* minor formatting and naming fixes

* adding test files and profiler

* fixing minor error

* minor fix

* removed unneccesary comments, renamed files

* updated instance list for client example, added different layout example

* removing instances

* fixed error in instance generation

* remove comments

* update profiler and client example tensor layouts

* fixed errors in test/profiler

* updated vector dim access to enable vector load

* updated test/profiler files

* updated example with 1d kernel

* updating profiler

* renamed files

* disabled device op for MI300

* skip  elementwise_permute_2d on gfx94x

* Update CMakeLists.txt

* fixing CMake - disabling some GPU targets

* added transpose profiler to CMake

* fixed transpose profiler errors

* fixed instances for tests/profiler

* cleaned up code in transpose profiler source code

* added some comments, updated copyright

* made function arguments const where possible

---------

Co-authored-by: Jing Zhang <[email protected]>
Co-authored-by: Jing Zhang <[email protected]>
Co-authored-by: zjing14 <[email protected]>
  • Loading branch information
4 people authored Jan 4, 2024
1 parent fbf31a2 commit aa3e2d7
Show file tree
Hide file tree
Showing 8 changed files with 151 additions and 116 deletions.
12 changes: 6 additions & 6 deletions example/44_elementwise_permute/elementwise_permute_3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
using F16 = ck::half_t;
using F32 = float;

using ADataType = F16;
using BDataType = F16;
using ADataType = F32;
using BDataType = F32;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
Expand All @@ -25,10 +25,10 @@ using DeviceElementwisePermuteInstance =
2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W}
1, // NumDim_k, {D}
8, // MPerThread
8, // NPerThread
8, // KPerThread
ck::Sequence<8>, // InScalarPerVectorSeq
4, // MPerThread
4, // NPerThread
4, // KPerThread
ck::Sequence<4>, // InScalarPerVectorSeq
ck::Sequence<4>>; // OutScalarPerVectorSeq

template <typename HostTensorA, typename HostTensorB, typename Functor>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,19 @@ template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using device_transpose_f16_instances = std::tuple<
// FOR 16, 32, 16, 32, 16
// clang-format off
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 1, 1, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 8, 8, 8, ck::Sequence<8>, ck::Sequence<4>>,
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 4, 4, 8, ck::Sequence<4>, ck::Sequence<4>>,
DeviceElementwise3dImpl<ck::Tuple<F16>, ck::Tuple<F16>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>
// clang-format on
>;

using device_transpose_f32_instances = std::tuple<
// for 16, 8, 16, 32, 8 -> test with instances for fp16
// clang-format off
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 4, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 8, 8, ck::Sequence<1>, ck::Sequence<1>>
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>, ck::Tuple<F32>, PassThrough, 2, 2, 1, 4, 4, 4, ck::Sequence<4>, ck::Sequence<4>>
// clang-format on
>;

Expand Down
12 changes: 4 additions & 8 deletions profiler/include/profiler/profile_transpose_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace ck {
namespace profiler {

template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor)
{
for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n)
for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c)
Expand All @@ -34,7 +34,7 @@ void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functo
for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w)
{
auto a_val = A_ncdhw(n, c, d, h, w);
functor(B_nchwd(n, c, h, w, d), a_val);
functor(B_ndhwc(n, d, h, w, c), a_val);
}
}

Expand Down Expand Up @@ -77,8 +77,6 @@ bool profile_transpose_impl(int do_verification,

using ElementOp = ck::tensor_operation::element_wise::PassThrough;

// const auto element_op = ElementOp{};

DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

Expand Down Expand Up @@ -118,6 +116,7 @@ bool profile_transpose_impl(int do_verification,
// re-init C to zero before profiling next kernel
b_device_buf.SetZero();

// run for verification
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});

if(do_verification)
Expand All @@ -136,6 +135,7 @@ bool profile_transpose_impl(int do_verification,

std::string op_name = op_ptr->GetTypeString();

// run for timing purposes
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

Expand All @@ -153,10 +153,6 @@ bool profile_transpose_impl(int do_verification,
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;

// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
pass &= ck::utils::check_err(
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);

if(tflops > best_tflops)
{
best_op_name = op_name;
Expand Down
2 changes: 2 additions & 0 deletions profiler/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ set(PROFILER_SOURCES
profile_batchnorm_infer.cpp
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
)

if(DL_KERNELS)
Expand Down Expand Up @@ -91,6 +92,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_d
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)

if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
Expand Down
112 changes: 112 additions & 0 deletions profiler/src/profile_transpose.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>

#include "profiler/profile_transpose_impl.hpp"
#include "profiler_operation_registry.hpp"

enum struct DataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16, // 1
};

#define OP_NAME "transpose"
#define OP_DESC "Transpose"

struct TransposeArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {{"lengths", {}}};

bool parse_opt(const int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
const int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}

void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};

static void print_helper_msg()
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: verification (0: no; 1: yes)\n");
printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg5: print tensor value (0: no; 1: yes)\n");
printf("arg6: time kernel (0=no, 1=yes)\n");
printf("arg7: --lengths: N, C, D, H, W\n");
}

int profile_transpose(int argc, char* argv[])
{
if(argc != 7)
{
print_helper_msg();
exit(1);
}
TransposeArgParser arg_parser;

const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
arg_parser(argc, argv);
const std::vector<ck::index_t> lengths = arg_parser.long_opts["lengths"];

using F32 = float;
using F16 = ck::half_t;

auto profile = [&](auto a_type, auto b_type) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
constexpr ck::index_t NumDim = 5;

bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, NumDim>(
do_verification, init_method, do_log, time_kernel, lengths);

return pass ? 0 : 1;
};

if(data_type == DataType::F32_F32_F32_F32_F32)
{
return profile(F32{}, F32{});
}
else if(data_type == DataType::F16_F16_F16_F16_F16)
{
return profile(F16{}, F16{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;

return 1;
}
}

REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
34 changes: 21 additions & 13 deletions test/transpose/test_transpose.cpp
Original file line number Diff line number Diff line change
@@ -1,27 +1,35 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_transpose_util.hpp"
#include "profiler/profile_transpose_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestTranspose : public ::testing::Test
{
protected:
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;

void Run()
{
std::vector<std::vector<ck::index_t>> lengths = {
{4, 16, 16, 32, 5}, {8, 16, 16, 32, 8} /**{32, 16, 16, 32, 8},**/};

for(auto length : lengths)
{
bool success = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
};

// clang-format off
using KernelTypes = ::testing::Types<
std::tuple< F16, F16>,
std::tuple< F32, F32>
>;
// clang-format on
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;

TYPED_TEST_SUITE(TestTranspose, KernelTypes);

//#include "test_transpose_ut_cases.inc"
TYPED_TEST(TestTranspose, Test_FP16) { this->Run(); }
TYPED_TEST(TestTranspose, Test_FP32) { this->Run(); }
28 changes: 0 additions & 28 deletions test/transpose/test_transpose_ut_cases.inc

This file was deleted.

54 changes: 0 additions & 54 deletions test/transpose/test_transpose_util.hpp

This file was deleted.

0 comments on commit aa3e2d7

Please sign in to comment.