diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c2b74eabee..ba465f64a7 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -38,4 +38,3 @@ The MIOpen API library is structured as follows: * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental) - * :doc:`GLU <../doxygen/html/group__glu>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 60d6fe6ce6..48eb4efefb 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -47,6 +47,7 @@ add_executable(MIOpenDriver dm_glu.cpp dm_groupnorm.cpp dm_kthvalue.cpp + dm_l1loss.cpp dm_layernorm.cpp dm_lrn.cpp dm_multimarginloss.cpp diff --git a/driver/dm_l1loss.cpp b/driver/dm_l1loss.cpp new file mode 100644 index 0000000000..113a09193c --- /dev/null +++ b/driver/dm_l1loss.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "l1loss_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "l1loss") + return new L1LossDriver(); + if(base_arg == "l1lossfp16") + return new L1LossDriver(); + if(base_arg == "l1lossbfp16") + return new L1LossDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index d77d5d02d2..019a412505 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], " - "multimarginloss[bfp16|fp16]\n"); + "multimarginloss[bfp16|fp16], l1loss[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -352,7 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" && arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" && arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" && - arg != "--version") + arg != "l1loss" && arg != "l1lossfp16" && arg != "l1lossbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/l1loss_driver.hpp b/driver/l1loss_driver.hpp new file mode 100644 index 0000000000..6982cef8cd --- /dev/null +++ b/driver/l1loss_driver.hpp @@ -0,0 +1,362 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include + +#include + +template +int mloL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc, + const Tgpu* input, + const Tgpu* target, + Tcheck* outputhost, + miopenLossReductionMode_t reduction) +{ + auto size = miopen::deref(iDesc).GetElementSize(); + size_t divisor = (reduction == MIOPEN_LOSS_REDUCTION_MEAN) ? size : 1; + + double output = 0.0; + for(size_t i = 0; i < size; i++) + { + float diff = abs(static_cast(input[i]) - static_cast(target[i])); + output += diff; + } + outputhost[0] = output / divisor; + + return 0; +} + +template +class L1LossDriver : public Driver +{ +public: + L1LossDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&targetDesc); + miopenCreateTensorDescriptor(&outputDesc); + + data_type = miopen_type{}; + } + + std::vector ComputeStrides(std::vector inputDim); + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~L1LossDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(targetDesc); + miopenDestroyTensorDescriptor(outputDesc); + } + +private: + InputFlags inflags; + + int forw; + bool isContiguous; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t targetDesc; + miopenTensorDescriptor_t outputDesc; + + std::unique_ptr in_dev; + std::unique_ptr tar_dev; + std::unique_ptr out_dev; + std::unique_ptr workspace_dev; + + std::vector in; + std::vector tar; + std::vector out; + std::vector workspace; + + std::vector outhost; + + size_t ws_sizeInBytes; + miopenLossReductionMode_t reduction; +}; + +// Equivalent tensor.transpose(0, -1).contiguous().transpose(0, -1) +template +std::vector L1LossDriver::ComputeStrides(std::vector inputDim) +{ + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +int L1LossDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + reduction = static_cast(inflags.GetValueInt("reduction")); + isContiguous = inflags.GetValueInt("contiguous") > 0 ? true : false; + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int L1LossDriver::GetandSetData() +{ + auto in_len = inflags.GetValueTensor("dim-lengths").lengths; + auto in_strides = ComputeStrides(in_len); + auto tar_strides = ComputeStrides(in_len); + + SetTensorNd(inputDesc, in_len, in_strides, data_type); + SetTensorNd(targetDesc, in_len, tar_strides, data_type); + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + SetTensorNd(outputDesc, in_len, in_strides, data_type); + } + else + { + std::vector out_lens = {1}; + SetTensorNd(outputDesc, out_lens, data_type); + } + + return miopenStatusSuccess; +} + +template +int L1LossDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward L1Loss (Default=1)", "int"); + inflags.AddTensorFlag( + "dim-lengths", 'D', "256x512", "The dimensional lengths of the input tensor"); + inflags.AddInputFlag("contiguous", + 'C', + "1", + "Tensor is contiguous or not (Default=1 for contiguous tensor)", + "int"); + inflags.AddInputFlag("reduction", + 'R', + "2", + "Reduction mode ('none'(0) | 'sum'(1) |'mean'(2)) " + "(Default=2)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int L1LossDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t tar_sz = GetTensorSize(targetDesc); + size_t out_sz = GetTensorSize(outputDesc); + + miopenGetL1LossForwardWorkspaceSize( + GetHandle(), reduction, inputDesc, targetDesc, outputDesc, &ws_sizeInBytes); + + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu); + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + tar_dev = std::unique_ptr(new GPUMem(ctx, tar_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + in = std::vector(in_sz, static_cast(0)); + tar = std::vector(tar_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + workspace = std::vector(ws_sz, static_cast(0)); + + outhost = std::vector(out_sz, static_cast(0)); + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(0.2)); + } + + for(int i = 0; i < tar_sz; i++) + { + tar[i] = prng::gen_A_to_B(static_cast(0.01), static_cast(0.21)); + } + + fill(out.begin(), out.end(), static_cast(0)); + + if(in_dev->ToGPU(GetStream(), in.data()) != 0) + std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl; + + if(tar_dev->ToGPU(GetStream(), tar.data()) != 0) + std::cerr << "Error copying (tar) to GPU, size: " << tar_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int L1LossDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenStatus_t status = miopenL1LossForward(GetHandle(), + reduction, + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + in_dev->GetMem(), + targetDesc, + tar_dev->GetMem(), + outputDesc, + out_dev->GetMem()); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenL1LossForward"); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward L1Loss Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward L1Loss Elapsed: " << kernel_average_time << " ms\n"; + } + + if(out_dev->FromGPU(GetStream(), out.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int L1LossDriver::RunForwardCPU() +{ + if(reduction == MIOPEN_LOSS_REDUCTION_MEAN || reduction == MIOPEN_LOSS_REDUCTION_SUM) + { + mloL1LossReducedForwardRunHost( + inputDesc, in.data(), tar.data(), outhost.data(), reduction); + } + + return miopenStatusSuccess; +} + +template +int L1LossDriver::RunBackwardGPU() +{ + return miopenStatusNotImplemented; +} + +template +int L1LossDriver::RunBackwardCPU() +{ + return miopenStatusNotImplemented; +} + +template +Tref L1LossDriver::GetTolerance() +{ + Tref tolerance = std::numeric_limits::epsilon() * 10; + return tolerance; +} + +template +int L1LossDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(outhost, out); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward L1Loss FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward L1Loss Verifies OK on CPU reference (" << error << " < " << tolerance + << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int L1LossDriver::VerifyBackward() +{ + return miopenStatusNotImplemented; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 2b9b125512..3e8ce6614c 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -8178,6 +8178,60 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif // MIOPEN_BETA_API +// L1Loss APIs +#ifdef MIOPEN_BETA_API +/** @addtogroup LossFunction + * + * @{ + */ + +/*! @brief Helper function to query the minimum workspace size required by the L1Loss call + * + * @param handle MIOpen Handle (input) + * @param reduction Reduction mode (input) + * @param iDesc Tensor descriptor for input tensor (input) + * @param tDesc Tensor descriptor for target tensor (input) + * @param oDesc Tensor descriptor for output tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetL1LossForwardWorkspaceSize(miopenHandle_t handle, + miopenLossReductionMode_t reduction, + miopenTensorDescriptor_t iDesc, + miopenTensorDescriptor_t tDesc, + miopenTensorDescriptor_t oDesc, + size_t* sizeInBytes); + +/*! @brief Execute L1Loss forward layer + * + * @param handle MIOpen handle (input) + * @param reduction Reduction mode (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param iDesc Tensor descriptor for input tensor (input) + * @param i Input tensor (input) + * @param tDesc Tensor descriptor for target tensor (input) + * @param t Target tensor (input) + * @param oDesc Tensor descriptor for output tensor (input) + * @param o Output tensor (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenL1LossForward(miopenHandle_t handle, + miopenLossReductionMode_t reduction, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t iDesc, + const void* i, + miopenTensorDescriptor_t tDesc, + const void* t, + miopenTensorDescriptor_t oDesc, + void* o); + +/** @} */ +// CLOSEOUT LossFunction DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ccb8cbab28..0d2b258685 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -156,6 +156,8 @@ set( MIOpen_Source kernel_warnings.cpp kthvalue/problem_description.cpp kthvalue_api.cpp + l1loss/problem_description.cpp + l1loss_api.cpp layernorm_api.cpp layernorm/problem_description.cpp load_file.cpp @@ -306,6 +308,7 @@ set( MIOpen_Source solver/groupnorm/forward_groupnorm.cpp solver/getitem/backward_getitem.cpp solver/kthvalue/forward_kthvalue.cpp + solver/l1loss/forward_l1loss.cpp solver/layernorm/backward_t5layernorm.cpp solver/layernorm/forward_addlayernorm.cpp solver/layernorm/forward_layernorm.cpp @@ -534,7 +537,9 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenGroupNorm.cpp kernels/MIOpenGetitem.cpp kernels/MIOpenKthvalue.cpp + kernels/MIOpenL1Loss.cpp kernels/MIOpenLayerNorm.cpp + kernels/MIOpenLossReduce.cpp kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl kernels/MIOpenMultiMarginLoss.cpp @@ -674,6 +679,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN glu.cpp kernel_cache.cpp kthvalue.cpp + l1loss.cpp layernorm.cpp lrn.cpp mlo_dir_conv.cpp diff --git a/src/include/miopen/l1loss.hpp b/src/include/miopen/l1loss.hpp new file mode 100644 index 0000000000..07e74d2706 --- /dev/null +++ b/src/include/miopen/l1loss.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT size_t GetL1LossForwardWorkspaceSize(Handle& handle, + miopenLossReductionMode_t reduction, + const TensorDescriptor& iDesc, + const TensorDescriptor& tDesc, + const TensorDescriptor& oDesc); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t L1LossForward(Handle& handle, + miopenLossReductionMode_t reduction, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& iDesc, + ConstData_t i, + const TensorDescriptor& tDesc, + ConstData_t t, + const TensorDescriptor& oDesc, + Data_t o); + +} // namespace miopen diff --git a/src/include/miopen/l1loss/invoke_params.hpp b/src/include/miopen/l1loss/invoke_params.hpp new file mode 100644 index 0000000000..ae8974110b --- /dev/null +++ b/src/include/miopen/l1loss/invoke_params.hpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace miopen { + +namespace l1loss { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* iDesc = nullptr; + const TensorDescriptor* tDesc = nullptr; + const TensorDescriptor* oDesc = nullptr; + + ConstData_t i = nullptr; + ConstData_t t = nullptr; + Data_t o = nullptr; + + miopenLossReductionMode_t reduction = MIOPEN_LOSS_REDUCTION_MEAN; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace l1loss + +} // namespace miopen diff --git a/src/include/miopen/l1loss/problem_description.hpp b/src/include/miopen/l1loss/problem_description.hpp new file mode 100644 index 0000000000..125d6da4ee --- /dev/null +++ b/src/include/miopen/l1loss/problem_description.hpp @@ -0,0 +1,108 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace l1loss { + +struct FwdProblemDescription : ProblemDescriptionBase +{ + FwdProblemDescription(const TensorDescriptor& iDesc_, + const TensorDescriptor& tDesc_, + const TensorDescriptor& oDesc_, + miopenLossReductionMode_t reduction_) + : iDesc(iDesc_), tDesc(tDesc_), oDesc(oDesc_), reduction(reduction_) + { + if(iDesc.GetNumDims() != tDesc.GetNumDims()) + { + MIOPEN_THROW(miopenStatusBadParm, + "L1Loss::ProblemDescription: Number of dimensions between input tensor " + "and target tensor do not match."); + } + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + if(iDesc.GetNumDims() != oDesc.GetNumDims()) + { + MIOPEN_THROW(miopenStatusBadParm, + "L1Loss::ProblemDescription: Number of dimensions between input " + "tensor and output tensor do not match."); + } + } + else + { + if(oDesc.GetNumDims() != 1) + { + MIOPEN_THROW(miopenStatusBadParm, + "L1Loss::ProblemDescription: Number of output tensor's dimension do " + "not equal 1 in case of reduction."); + } + } + + if(!IsSameType()) + { + MIOPEN_THROW( + miopenStatusBadParm, + "L1Loss::ProblemDescription: Input, target and output tensor have different " + "data type."); + } + } + + miopenLossReductionMode_t GetReduction() const { return reduction; } + const TensorDescriptor& GetIDesc() const { return iDesc; } + const TensorDescriptor& GetTDesc() const { return tDesc; } + const TensorDescriptor& GetODesc() const { return oDesc; } + + bool IsSameType() const + { + if(iDesc.GetType() != tDesc.GetType() || iDesc.GetType() != oDesc.GetType()) + { + return false; + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor iDesc; + TensorDescriptor tDesc; + TensorDescriptor oDesc; + miopenLossReductionMode_t reduction; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace l1loss + +} // namespace miopen diff --git a/src/include/miopen/l1loss/solvers.hpp b/src/include/miopen/l1loss/solvers.hpp new file mode 100644 index 0000000000..dac1c95a4a --- /dev/null +++ b/src/include/miopen/l1loss/solvers.hpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace l1loss { + +using L1LossForwardSolverBase = + NonTunableSolverBase; + +struct L1LossForward5d final : L1LossForwardSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::l1loss::FwdProblemDescription& problem) const override; + + bool IsImprovementOverROCm(const ExecutionContext& context, + const miopen::l1loss::FwdProblemDescription& problem) const; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::l1loss::FwdProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::l1loss::FwdProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +} // namespace l1loss + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 76a13b051c..1bb7f6fa2e 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -64,7 +64,8 @@ enum class Primitive ReLU, Kthvalue, SoftMarginLoss, - MultiMarginLoss + MultiMarginLoss, + Loss }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenL1Loss.cpp b/src/kernels/MIOpenL1Loss.cpp new file mode 100644 index 0000000000..e81671485d --- /dev/null +++ b/src/kernels/MIOpenL1Loss.cpp @@ -0,0 +1,64 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void L1LossReducedForward5d_kernel(const TIO* I, + const TIO* T, + FLOAT_ACCUM* lsum, + const size_t divisor, + tensor_view_t<5> I_tv, + tensor_view_t<5> T_tv) +{ + const size_t gid = blockIdx.x * blockDim.x + threadIdx.x; + const float div = static_cast(divisor); + tensor_layout_t<5> input_layout(I_tv, gid); + + if(input_layout.layout[0] >= I_tv.size[0]) + return; + + size_t Iidx = I_tv.get_tensor_view_idx(input_layout); + size_t Tidx = T_tv.get_tensor_view_idx(input_layout); + + FLOAT_ACCUM diff = abs(CVT_FLOAT2ACCUM(I[Iidx]) - CVT_FLOAT2ACCUM(T[Tidx])) / div; + lsum[gid] = diff; +} + +extern "C" __global__ void L1LossReducedForward5d(const IO_TYPE* I, + const IO_TYPE* T, + FLOAT_ACCUM* lsum, + const size_t divisor, + tensor_view_t<5> I_tv, + tensor_view_t<5> T_tv) +{ + L1LossReducedForward5d_kernel(I, T, lsum, divisor, I_tv, T_tv); +} diff --git a/src/kernels/MIOpenLossReduce.cpp b/src/kernels/MIOpenLossReduce.cpp new file mode 100644 index 0000000000..adc1eb2cb1 --- /dev/null +++ b/src/kernels/MIOpenLossReduce.cpp @@ -0,0 +1,51 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "warp_reduce.hpp" + +template +__device__ void LossSum(const DTYPE* __restrict__ input, DTYPE* __restrict__ output, uint64_t N) +{ + auto gid = blockIdx.x * blockDim.x + threadIdx.x; + + FLOAT_ACCUM val = gid < N ? CVT_FLOAT2ACCUM(input[gid]) : CVT_FP32_2ACCUM(0.0f); + val = block_reduce_sum(val); + + if(threadIdx.x == 0) + output[blockIdx.x] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void +ReduceSumLoss(const FLOAT* __restrict__ input, FLOAT* __restrict__ output, uint64_t N) +{ + // instantiate the kernel + LossSum(input, output, N); +} \ No newline at end of file diff --git a/src/l1loss.cpp b/src/l1loss.cpp new file mode 100644 index 0000000000..69fcf827d9 --- /dev/null +++ b/src/l1loss.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +size_t GetL1LossForwardWorkspaceSize(Handle& handle, + miopenLossReductionMode_t reduction, + const TensorDescriptor& iDesc, + const TensorDescriptor& tDesc, + const TensorDescriptor& oDesc) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = l1loss::FwdProblemDescription{iDesc, tDesc, oDesc, reduction}; + + const auto algo = AlgorithmName{"L1LossForward"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t L1LossForward(Handle& handle, + miopenLossReductionMode_t reduction, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& iDesc, + ConstData_t i, + const TensorDescriptor& tDesc, + ConstData_t t, + const TensorDescriptor& oDesc, + Data_t o) +{ + const auto problem = l1loss::FwdProblemDescription{iDesc, tDesc, oDesc, reduction}; + + const auto invoke_params = [&]() { + auto tmp = l1loss::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.reduction = reduction; + tmp.iDesc = &iDesc; + tmp.tDesc = &tDesc; + tmp.oDesc = &oDesc; + tmp.i = i; + tmp.t = t; + tmp.o = o; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + return tmp; + }(); + + const auto algo = AlgorithmName{"L1LossForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/l1loss/problem_description.cpp b/src/l1loss/problem_description.cpp new file mode 100644 index 0000000000..cbd2b370e0 --- /dev/null +++ b/src/l1loss/problem_description.cpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace l1loss { + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = iDesc.GetType(); + auto size = iDesc.GetElementSize(); + + std::ostringstream ss; + + ss << "l1loss_fwd"; + ss << "reduction" << reduction; + ss << "i_dtype" << input_dtype; + ss << "size" << size; + + return NetworkConfig{ss.str()}; +} + +} // namespace l1loss + +} // namespace miopen diff --git a/src/l1loss_api.cpp b/src/l1loss_api.cpp new file mode 100644 index 0000000000..4313a4eca4 --- /dev/null +++ b/src/l1loss_api.cpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +extern "C" miopenStatus_t miopenGetL1LossForwardWorkspaceSize(miopenHandle_t handle, + miopenLossReductionMode_t reduction, + const miopenTensorDescriptor_t iDesc, + const miopenTensorDescriptor_t tDesc, + const miopenTensorDescriptor_t oDesc, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, reduction, iDesc, tDesc, oDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetL1LossForwardWorkspaceSize(miopen::deref(handle), + reduction, + miopen::deref(iDesc), + miopen::deref(tDesc), + miopen::deref(oDesc)); + }); +} + +extern "C" miopenStatus_t miopenL1LossForward(miopenHandle_t handle, + miopenLossReductionMode_t reduction, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t iDesc, + const void* i, + const miopenTensorDescriptor_t tDesc, + const void* t, + const miopenTensorDescriptor_t oDesc, + void* o) +{ + MIOPEN_LOG_FUNCTION( + handle, reduction, workspace, workspaceSizeInBytes, iDesc, i, tDesc, t, oDesc, o); + + return miopen::try_([&] { + miopen::L1LossForward(miopen::deref(handle), + reduction, + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(iDesc), + DataCast(i), + miopen::deref(tDesc), + DataCast(t), + miopen::deref(oDesc), + DataCast(o)); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index 856f04a8cf..808aa17409 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -672,7 +673,6 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); - Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); @@ -715,6 +715,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) multimarginloss::MultiMarginLossForward{}.SolverDbId()); Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId()); + + Register(registry, ++id, Primitive::Loss, l1loss::L1LossForward5d{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white // space between this comment and the newly registered solver(s)! } diff --git a/src/solver/l1loss/forward_l1loss.cpp b/src/solver/l1loss/forward_l1loss.cpp new file mode 100644 index 0000000000..d9b8d544e5 --- /dev/null +++ b/src/solver/l1loss/forward_l1loss.cpp @@ -0,0 +1,243 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define LOCAL_SIZE_FWD 256 +#define LOCAL_SIZE_REDUCE 1024 + +namespace miopen { + +namespace solver { + +namespace l1loss { + +bool L1LossForward5d::IsImprovementOverROCm( + const ExecutionContext& /*context*/, const miopen::l1loss::FwdProblemDescription& problem) const +{ + if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE) + { + return false; + } + + return true; +} + +bool L1LossForward5d::IsApplicable(const ExecutionContext& /*context*/, + const miopen::l1loss::FwdProblemDescription& problem) const +{ + if(!IsImprovementOverROCm({}, problem)) + { + return false; + } + + return true; +} + +ConvSolution +L1LossForward5d::GetSolution(const ExecutionContext& /*context*/, + const miopen::l1loss::FwdProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetODesc().GetType(); + auto io_dtype = miopen::GetDataType(dtype); + auto input_size = problem.GetIDesc().GetElementSize(); + + const auto build_params = + KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IO_TYPE", io_dtype == "bfloat16" ? "ushort" : io_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_REDUCE}}; + + { + /* Phase 1: Calculate loss elementwise. (TIO to FLOAT_ACCUM) */ + size_t xlocalsize = LOCAL_SIZE_FWD; + size_t xgridsize = AlignUp(input_size, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenL1Loss.cpp"; + kernel.kernel_name = "L1LossReducedForward5d"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + + { + /* Phase 2: Reduce sum (FLOAT_ACCUM to FLOAT_ACCUM) */ + auto _size = input_size; + + do + { + size_t xlocalsize = LOCAL_SIZE_REDUCE; + size_t xgridsize = AlignUp(_size, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenReduceSum.cpp"; + kernel.kernel_name = "ReduceSumFLOATACCUM"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE; + } while(_size > LOCAL_SIZE_REDUCE); + + /* Reduce sum (FLOAT_ACCUM to TIO) */ + size_t xlocalsize = LOCAL_SIZE_REDUCE; + size_t xgridsize = AlignUp(_size, xlocalsize); + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenReduceSum.cpp"; + kernel.kernel_name = "ReduceSum"; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(1); + kernel.l_wk.push_back(1); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(1); + kernel.g_wk.push_back(1); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [input_size, dtype](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + auto elapsed = 0.f; + HipEventPtr start, stop; + + const bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + { + /* Phase 1: Calculate loss elementwise. */ + auto I_tv = get_inner_expanded_tv<5>(deref(params.iDesc)); + auto T_tv = get_inner_expanded_tv<5>(deref(params.tDesc)); + size_t divisor = (params.reduction == MIOPEN_LOSS_REDUCTION_SUM) ? 1 : input_size; + + decltype(auto) kernel = handle_.Run(kernels.front()); + kernel(params.i, params.t, params.workspace, divisor, I_tv, T_tv); + } + + { + /* Phase 2: Reduce. */ + auto _size = input_size; + auto reduce_in = params.workspace; + auto reduce_out = static_cast(static_cast(params.workspace) + + input_size * get_data_size(dtype)); + + for(size_t i = 1; i < kernels.size() - 1; ++i) + { + decltype(auto) kernel = handle_.Run(kernels[i]); + + kernel(reduce_in, reduce_out, _size); + std::swap(reduce_in, reduce_out); + + _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE; + } + + decltype(auto) kernel = handle_.Run(kernels.back()); + kernel(reduce_in, params.o, _size); + + if(profiling) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + + handle_.EnableProfiling(true); + } + } + }; + }; + + return result; +} + +std::size_t +L1LossForward5d::GetWorkspaceSize(const ExecutionContext& /*context*/, + const miopen::l1loss::FwdProblemDescription& problem) const +{ + if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE) + { + return 0; + } + + size_t input_size = problem.GetIDesc().GetElementSize(); + return (input_size + (input_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE) * + get_data_size(miopenFloat); +} + +} // namespace l1loss + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_l1loss.hpp b/test/cpu_l1loss.hpp new file mode 100644 index 0000000000..9e2ef0d121 --- /dev/null +++ b/test/cpu_l1loss.hpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include "tensor_holder.hpp" +#include + +template +void cpu_l1loss_reduced_forward(tensor input, + tensor target, + tensor& ref_output, + miopenLossReductionMode_t reduction) +{ + auto inputSize = input.desc.GetElementSize(); + size_t divisor = (reduction == MIOPEN_LOSS_REDUCTION_SUM) ? 1 : inputSize; + + double output = 0.0; + for(size_t i = 0; i < inputSize; i++) + { + float diff = abs(static_cast(input[i]) - static_cast(target[i])); + output += diff; + } + ref_output[0] = output / divisor; +} diff --git a/test/gtest/l1loss.cpp b/test/gtest/l1loss.cpp new file mode 100644 index 0000000000..a62fb13bc0 --- /dev/null +++ b/test/gtest/l1loss.cpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "l1loss.hpp" +using float16 = half_float::half; + +namespace l1loss { + +using GPU_L1Loss_fwd_FP32 = L1LossFwdTest; +using GPU_L1Loss_fwd_FP16 = L1LossFwdTest; +using GPU_L1Loss_fwd_BFP16 = L1LossFwdTest; + +} // namespace l1loss +using namespace l1loss; + +TEST_P(GPU_L1Loss_fwd_FP32, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_L1Loss_fwd_FP16, Test) +{ + RunTest(); + Verify(); +}; + +TEST_P(GPU_L1Loss_fwd_BFP16, Test) +{ + RunTest(); + Verify(); +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_L1Loss_fwd_FP32, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_L1Loss_fwd_FP16, testing::ValuesIn(GenFullTestCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_L1Loss_fwd_BFP16, testing::ValuesIn(GenFullTestCases())); diff --git a/test/gtest/l1loss.hpp b/test/gtest/l1loss.hpp new file mode 100644 index 0000000000..82555572f7 --- /dev/null +++ b/test/gtest/l1loss.hpp @@ -0,0 +1,206 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "cpu_l1loss.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include "random.hpp" + +#include +#include +#include + +#include +#include + +struct L1LossTestCase +{ + std::vector dims; + miopenLossReductionMode_t reduction; + bool isContiguous; + + friend std::ostream& operator<<(std::ostream& os, const L1LossTestCase& tc) + { + os << "Dims: "; + for(auto dim_sz : tc.dims) + { + os << dim_sz << " "; + } + return os << " reducion mode: " << tc.reduction << " contiguous: " << tc.isContiguous; + } + + L1LossTestCase() {} + + L1LossTestCase(std::vector dims_, miopenLossReductionMode_t reduction_, bool cont_) + : dims(dims_), reduction(reduction_), isContiguous(cont_) + { + } + + std::vector ComputeStrides() const + { + std::vector inputDim = dims; + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + +inline std::vector GenFullTestCases() +{ // n c d h w dim + // clang-format off + return { + {{1, 1, 1, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, false}, + {{1, 2, 3, 4, 1}, MIOPEN_LOSS_REDUCTION_SUM, false}, + {{1, 1, 1, 257, 1}, MIOPEN_LOSS_REDUCTION_SUM, false}, + {{2, 10, 128, 64, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false}, + {{5, 13, 17, 11, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false}, + {{256, 4, 128, 1, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false}, + {{256, 4, 128, 1, 1}, MIOPEN_LOSS_REDUCTION_MEAN, true}, + {{1, 1, 1, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true}, + {{34, 4, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true}, + {{4, 7, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true}, + {{15, 4, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true} + }; + // clang-format on +} + +template +struct L1LossFwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + l1loss_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 99); }; + + reduction = l1loss_config.reduction; + auto in_dims = l1loss_config.dims; + auto in_strides = l1loss_config.ComputeStrides(); + input = tensor{in_dims, in_strides}.generate(gen_value1); + + auto tar_strides = l1loss_config.ComputeStrides(); + target = tensor{in_dims, tar_strides}.generate(gen_value2); + + auto out_lengths = + (reduction == MIOPEN_LOSS_REDUCTION_NONE) ? in_dims : std::vector{1}; + + output = tensor{out_lengths}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_lengths}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + ws_sizeInBytes = (reduction == MIOPEN_LOSS_REDUCTION_NONE) + ? 0 + : miopen::GetL1LossForwardWorkspaceSize( + handle, reduction, input.desc, target.desc, output.desc); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + if(ws_sizeInBytes != 0) + { + std::vector workspace_dims; + workspace_dims.push_back(ws_sizeInBytes / sizeof(float)); + + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), static_cast(0)); + + workspace_dev = handle.Write(workspace.data); + } + + input_dev = handle.Write(input.data); + target_dev = handle.Write(target.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + if(reduction != MIOPEN_LOSS_REDUCTION_NONE) + { + cpu_l1loss_reduced_forward(input, target, ref_output, reduction); + status = miopen::L1LossForward(handle, + reduction, + workspace_dev.get(), + ws_sizeInBytes, + input.desc, + input_dev.get(), + target.desc, + target_dev.get(), + output.desc, + output_dev.get()); + workspace.data = handle.Read(workspace_dev, workspace.data.size()); + } + + EXPECT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + double GetTolerance() + { + double tolerance = std::numeric_limits::epsilon() * 10; + return tolerance; + } + + void Verify() + { + double threshold = GetTolerance(); + auto error = miopen::rms_range(ref_output, output); + + EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error + << ", Tolerance: " << threshold * 10; + } + + L1LossTestCase l1loss_config; + + tensor input; + tensor target; + tensor output; + tensor workspace; + miopenLossReductionMode_t reduction; + + tensor ref_output; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr target_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t ws_sizeInBytes; +};