diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index c2b74eabee..ba465f64a7 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -38,4 +38,3 @@ The MIOpen API library is structured as follows:
   * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental)
   * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental)
   * :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental)
-  * :doc:`GLU <../doxygen/html/group__glu>` (experimental)
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 60d6fe6ce6..48eb4efefb 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -47,6 +47,7 @@ add_executable(MIOpenDriver
     dm_glu.cpp
     dm_groupnorm.cpp
     dm_kthvalue.cpp
+    dm_l1loss.cpp
     dm_layernorm.cpp
     dm_lrn.cpp
     dm_multimarginloss.cpp
diff --git a/driver/dm_l1loss.cpp b/driver/dm_l1loss.cpp
new file mode 100644
index 0000000000..113a09193c
--- /dev/null
+++ b/driver/dm_l1loss.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "l1loss_driver.hpp"
+#include "registry_driver_maker.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "l1loss")
+        return new L1LossDriver<float, float>();
+    if(base_arg == "l1lossfp16")
+        return new L1LossDriver<float16, float>();
+    if(base_arg == "l1lossbfp16")
+        return new L1LossDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index d77d5d02d2..019a412505 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
            "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
            "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], "
            "prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], "
-           "multimarginloss[bfp16|fp16]\n");
+           "multimarginloss[bfp16|fp16], l1loss[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -352,7 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" &&
        arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" &&
        arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" &&
-       arg != "--version")
+       arg != "l1loss" && arg != "l1lossfp16" && arg != "l1lossbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/l1loss_driver.hpp b/driver/l1loss_driver.hpp
new file mode 100644
index 0000000000..6982cef8cd
--- /dev/null
+++ b/driver/l1loss_driver.hpp
@@ -0,0 +1,362 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+
+#include <../test/ford.hpp>
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <limits>
+#include <miopen/errors.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor.hpp>
+
+#include <vector>
+
+template <typename Tgpu, typename Tcheck>
+int mloL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc,
+                                   const Tgpu* input,
+                                   const Tgpu* target,
+                                   Tcheck* outputhost,
+                                   miopenLossReductionMode_t reduction)
+{
+    auto size      = miopen::deref(iDesc).GetElementSize();
+    size_t divisor = (reduction == MIOPEN_LOSS_REDUCTION_MEAN) ? size : 1;
+
+    double output = 0.0;
+    for(size_t i = 0; i < size; i++)
+    {
+        float diff = abs(static_cast<float>(input[i]) - static_cast<float>(target[i]));
+        output += diff;
+    }
+    outputhost[0] = output / divisor;
+
+    return 0;
+}
+
+template <typename Tgpu, typename Tref>
+class L1LossDriver : public Driver
+{
+public:
+    L1LossDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&targetDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    std::vector<int> ComputeStrides(std::vector<int> inputDim);
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~L1LossDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(targetDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+    bool isContiguous;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t targetDesc;
+    miopenTensorDescriptor_t outputDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> tar_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> tar;
+    std::vector<Tgpu> out;
+    std::vector<Tgpu> workspace;
+
+    std::vector<Tref> outhost;
+
+    size_t ws_sizeInBytes;
+    miopenLossReductionMode_t reduction;
+};
+
+// Equivalent tensor.transpose(0, -1).contiguous().transpose(0, -1)
+template <typename Tgpu, typename Tref>
+std::vector<int> L1LossDriver<Tgpu, Tref>::ComputeStrides(std::vector<int> inputDim)
+{
+    if(!isContiguous)
+        std::swap(inputDim.front(), inputDim.back());
+    std::vector<int> strides(inputDim.size());
+    strides.back() = 1;
+    for(int i = inputDim.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * inputDim[i + 1];
+    if(!isContiguous)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+    reduction    = static_cast<miopenLossReductionMode_t>(inflags.GetValueInt("reduction"));
+    isContiguous = inflags.GetValueInt("contiguous") > 0 ? true : false;
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::GetandSetData()
+{
+    auto in_len      = inflags.GetValueTensor("dim-lengths").lengths;
+    auto in_strides  = ComputeStrides(in_len);
+    auto tar_strides = ComputeStrides(in_len);
+
+    SetTensorNd(inputDesc, in_len, in_strides, data_type);
+    SetTensorNd(targetDesc, in_len, tar_strides, data_type);
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        SetTensorNd(outputDesc, in_len, in_strides, data_type);
+    }
+    else
+    {
+        std::vector<int> out_lens = {1};
+        SetTensorNd(outputDesc, out_lens, data_type);
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward L1Loss (Default=1)", "int");
+    inflags.AddTensorFlag(
+        "dim-lengths", 'D', "256x512", "The dimensional lengths of the input tensor");
+    inflags.AddInputFlag("contiguous",
+                         'C',
+                         "1",
+                         "Tensor is contiguous or not (Default=1 for contiguous tensor)",
+                         "int");
+    inflags.AddInputFlag("reduction",
+                         'R',
+                         "2",
+                         "Reduction mode ('none'(0) | 'sum'(1) |'mean'(2)) "
+                         "(Default=2)",
+                         "int");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t in_sz  = GetTensorSize(inputDesc);
+    size_t tar_sz = GetTensorSize(targetDesc);
+    size_t out_sz = GetTensorSize(outputDesc);
+
+    miopenGetL1LossForwardWorkspaceSize(
+        GetHandle(), reduction, inputDesc, targetDesc, outputDesc, &ws_sizeInBytes);
+
+    if(ws_sizeInBytes == static_cast<size_t>(-1))
+        return miopenStatusAllocFailed;
+
+    size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu);
+
+    uint32_t ctx = 0;
+
+    in_dev        = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
+    tar_dev       = std::unique_ptr<GPUMem>(new GPUMem(ctx, tar_sz, sizeof(Tgpu)));
+    out_dev       = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(Tgpu)));
+    workspace_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte)));
+
+    in        = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    tar       = std::vector<Tgpu>(tar_sz, static_cast<Tgpu>(0));
+    out       = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+    workspace = std::vector<Tgpu>(ws_sz, static_cast<Tgpu>(0));
+
+    outhost = std::vector<Tref>(out_sz, static_cast<Tref>(0));
+
+    for(int i = 0; i < in_sz; i++)
+    {
+        in[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(0.2));
+    }
+
+    for(int i = 0; i < tar_sz; i++)
+    {
+        tar[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.01), static_cast<Tgpu>(0.21));
+    }
+
+    fill(out.begin(), out.end(), static_cast<Tgpu>(0));
+
+    if(in_dev->ToGPU(GetStream(), in.data()) != 0)
+        std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl;
+
+    if(tar_dev->ToGPU(GetStream(), tar.data()) != 0)
+        std::cerr << "Error copying (tar) to GPU, size: " << tar_dev->GetSize() << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::RunForwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenStatus_t status = miopenL1LossForward(GetHandle(),
+                                                    reduction,
+                                                    workspace_dev->GetMem(),
+                                                    ws_sizeInBytes,
+                                                    inputDesc,
+                                                    in_dev->GetMem(),
+                                                    targetDesc,
+                                                    tar_dev->GetMem(),
+                                                    outputDesc,
+                                                    out_dev->GetMem());
+        MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenL1LossForward");
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Forward L1Loss Elapsed: " << t.gettime_ms() / iter
+                      << " ms\n";
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Forward L1Loss Elapsed: " << kernel_average_time << " ms\n";
+    }
+
+    if(out_dev->FromGPU(GetStream(), out.data()) != 0)
+        std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::RunForwardCPU()
+{
+    if(reduction == MIOPEN_LOSS_REDUCTION_MEAN || reduction == MIOPEN_LOSS_REDUCTION_SUM)
+    {
+        mloL1LossReducedForwardRunHost<Tgpu, Tref>(
+            inputDesc, in.data(), tar.data(), outhost.data(), reduction);
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::RunBackwardGPU()
+{
+    return miopenStatusNotImplemented;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::RunBackwardCPU()
+{
+    return miopenStatusNotImplemented;
+}
+
+template <typename Tgpu, typename Tref>
+Tref L1LossDriver<Tgpu, Tref>::GetTolerance()
+{
+    Tref tolerance = std::numeric_limits<Tgpu>::epsilon() * 10;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error           = miopen::rms_range(outhost, out);
+
+    if(!std::isfinite(error) || error > tolerance)
+    {
+        std::cout << "Forward L1Loss FAILED: " << error << " > " << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward L1Loss Verifies OK on CPU reference (" << error << " < " << tolerance
+                  << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int L1LossDriver<Tgpu, Tref>::VerifyBackward()
+{
+    return miopenStatusNotImplemented;
+}
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 2b9b125512..3e8ce6614c 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -8178,6 +8178,60 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+// L1Loss APIs
+#ifdef MIOPEN_BETA_API
+/** @addtogroup LossFunction
+ *
+ *  @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the L1Loss call
+ *
+ * @param handle                   MIOpen Handle (input)
+ * @param reduction                Reduction mode (input)
+ * @param iDesc                    Tensor descriptor for input tensor (input)
+ * @param tDesc                    Tensor descriptor for target tensor (input)
+ * @param oDesc                    Tensor descriptor for output tensor (input)
+ * @param sizeInBytes              Pointer to data to return the minimum workspace size
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetL1LossForwardWorkspaceSize(miopenHandle_t handle,
+                                    miopenLossReductionMode_t reduction,
+                                    miopenTensorDescriptor_t iDesc,
+                                    miopenTensorDescriptor_t tDesc,
+                                    miopenTensorDescriptor_t oDesc,
+                                    size_t* sizeInBytes);
+
+/*! @brief Execute L1Loss forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param reduction                Reduction mode (input)
+ * @param workspace                Address of the allocated workspace data (input)
+ * @param workspaceSizeInBytes     Size in bytes of the allocated workspace data (input)
+ * @param iDesc                    Tensor descriptor for input tensor (input)
+ * @param i                        Input tensor (input)
+ * @param tDesc                    Tensor descriptor for target tensor (input)
+ * @param t                        Target tensor (input)
+ * @param oDesc                    Tensor descriptor for output tensor (input)
+ * @param o                        Output tensor (output)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenL1LossForward(miopenHandle_t handle,
+                                                 miopenLossReductionMode_t reduction,
+                                                 void* workspace,
+                                                 size_t workspaceSizeInBytes,
+                                                 miopenTensorDescriptor_t iDesc,
+                                                 const void* i,
+                                                 miopenTensorDescriptor_t tDesc,
+                                                 const void* t,
+                                                 miopenTensorDescriptor_t oDesc,
+                                                 void* o);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ccb8cbab28..0d2b258685 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -156,6 +156,8 @@ set( MIOpen_Source
     kernel_warnings.cpp
     kthvalue/problem_description.cpp
     kthvalue_api.cpp
+    l1loss/problem_description.cpp
+    l1loss_api.cpp
     layernorm_api.cpp
     layernorm/problem_description.cpp
     load_file.cpp
@@ -306,6 +308,7 @@ set( MIOpen_Source
     solver/groupnorm/forward_groupnorm.cpp
     solver/getitem/backward_getitem.cpp
     solver/kthvalue/forward_kthvalue.cpp
+    solver/l1loss/forward_l1loss.cpp
     solver/layernorm/backward_t5layernorm.cpp
     solver/layernorm/forward_addlayernorm.cpp
     solver/layernorm/forward_layernorm.cpp
@@ -534,7 +537,9 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenGroupNorm.cpp
         kernels/MIOpenGetitem.cpp
         kernels/MIOpenKthvalue.cpp
+        kernels/MIOpenL1Loss.cpp
         kernels/MIOpenLayerNorm.cpp
+        kernels/MIOpenLossReduce.cpp
         kernels/MIOpenLRNBwd.cl
         kernels/MIOpenLRNFwd.cl
         kernels/MIOpenMultiMarginLoss.cpp
@@ -674,6 +679,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         glu.cpp
         kernel_cache.cpp
         kthvalue.cpp
+        l1loss.cpp
         layernorm.cpp
         lrn.cpp
         mlo_dir_conv.cpp
diff --git a/src/include/miopen/l1loss.hpp b/src/include/miopen/l1loss.hpp
new file mode 100644
index 0000000000..07e74d2706
--- /dev/null
+++ b/src/include/miopen/l1loss.hpp
@@ -0,0 +1,53 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+#include <miopen/miopen.h>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+MIOPEN_INTERNALS_EXPORT size_t GetL1LossForwardWorkspaceSize(Handle& handle,
+                                                             miopenLossReductionMode_t reduction,
+                                                             const TensorDescriptor& iDesc,
+                                                             const TensorDescriptor& tDesc,
+                                                             const TensorDescriptor& oDesc);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t L1LossForward(Handle& handle,
+                                                     miopenLossReductionMode_t reduction,
+                                                     Data_t workspace,
+                                                     size_t workspaceSizeInBytes,
+                                                     const TensorDescriptor& iDesc,
+                                                     ConstData_t i,
+                                                     const TensorDescriptor& tDesc,
+                                                     ConstData_t t,
+                                                     const TensorDescriptor& oDesc,
+                                                     Data_t o);
+
+} // namespace miopen
diff --git a/src/include/miopen/l1loss/invoke_params.hpp b/src/include/miopen/l1loss/invoke_params.hpp
new file mode 100644
index 0000000000..ae8974110b
--- /dev/null
+++ b/src/include/miopen/l1loss/invoke_params.hpp
@@ -0,0 +1,58 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace l1loss {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams() = default;
+
+    const TensorDescriptor* iDesc = nullptr;
+    const TensorDescriptor* tDesc = nullptr;
+    const TensorDescriptor* oDesc = nullptr;
+
+    ConstData_t i = nullptr;
+    ConstData_t t = nullptr;
+    Data_t o      = nullptr;
+
+    miopenLossReductionMode_t reduction = MIOPEN_LOSS_REDUCTION_MEAN;
+    Data_t workspace                    = nullptr;
+    std::size_t workspace_size          = 0;
+
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+} // namespace l1loss
+
+} // namespace miopen
diff --git a/src/include/miopen/l1loss/problem_description.hpp b/src/include/miopen/l1loss/problem_description.hpp
new file mode 100644
index 0000000000..125d6da4ee
--- /dev/null
+++ b/src/include/miopen/l1loss/problem_description.hpp
@@ -0,0 +1,108 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/miopen.h>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace l1loss {
+
+struct FwdProblemDescription : ProblemDescriptionBase
+{
+    FwdProblemDescription(const TensorDescriptor& iDesc_,
+                          const TensorDescriptor& tDesc_,
+                          const TensorDescriptor& oDesc_,
+                          miopenLossReductionMode_t reduction_)
+        : iDesc(iDesc_), tDesc(tDesc_), oDesc(oDesc_), reduction(reduction_)
+    {
+        if(iDesc.GetNumDims() != tDesc.GetNumDims())
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "L1Loss::ProblemDescription: Number of dimensions between input tensor "
+                         "and target tensor do not match.");
+        }
+
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            if(iDesc.GetNumDims() != oDesc.GetNumDims())
+            {
+                MIOPEN_THROW(miopenStatusBadParm,
+                             "L1Loss::ProblemDescription: Number of dimensions between input "
+                             "tensor and output tensor do not match.");
+            }
+        }
+        else
+        {
+            if(oDesc.GetNumDims() != 1)
+            {
+                MIOPEN_THROW(miopenStatusBadParm,
+                             "L1Loss::ProblemDescription: Number of output tensor's dimension do "
+                             "not equal 1 in case of reduction.");
+            }
+        }
+
+        if(!IsSameType())
+        {
+            MIOPEN_THROW(
+                miopenStatusBadParm,
+                "L1Loss::ProblemDescription: Input, target and output tensor have different "
+                "data type.");
+        }
+    }
+
+    miopenLossReductionMode_t GetReduction() const { return reduction; }
+    const TensorDescriptor& GetIDesc() const { return iDesc; }
+    const TensorDescriptor& GetTDesc() const { return tDesc; }
+    const TensorDescriptor& GetODesc() const { return oDesc; }
+
+    bool IsSameType() const
+    {
+        if(iDesc.GetType() != tDesc.GetType() || iDesc.GetType() != oDesc.GetType())
+        {
+            return false;
+        }
+        return true;
+    }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+protected:
+    TensorDescriptor iDesc;
+    TensorDescriptor tDesc;
+    TensorDescriptor oDesc;
+    miopenLossReductionMode_t reduction;
+
+    NetworkConfig MakeForwardNetworkConfig() const;
+};
+
+} // namespace l1loss
+
+} // namespace miopen
diff --git a/src/include/miopen/l1loss/solvers.hpp b/src/include/miopen/l1loss/solvers.hpp
new file mode 100644
index 0000000000..dac1c95a4a
--- /dev/null
+++ b/src/include/miopen/l1loss/solvers.hpp
@@ -0,0 +1,63 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/l1loss/problem_description.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace l1loss {
+
+using L1LossForwardSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::l1loss::FwdProblemDescription>;
+
+struct L1LossForward5d final : L1LossForwardSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<L1LossForward5d>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::l1loss::FwdProblemDescription& problem) const override;
+
+    bool IsImprovementOverROCm(const ExecutionContext& context,
+                               const miopen::l1loss::FwdProblemDescription& problem) const;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::l1loss::FwdProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::l1loss::FwdProblemDescription& problem) const override;
+    bool MayNeedWorkspace() const override { return true; }
+};
+
+} // namespace l1loss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 76a13b051c..1bb7f6fa2e 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -64,7 +64,8 @@ enum class Primitive
     ReLU,
     Kthvalue,
     SoftMarginLoss,
-    MultiMarginLoss
+    MultiMarginLoss,
+    Loss
 };
 
 struct MIOPEN_INTERNALS_EXPORT Id
diff --git a/src/kernels/MIOpenL1Loss.cpp b/src/kernels/MIOpenL1Loss.cpp
new file mode 100644
index 0000000000..e81671485d
--- /dev/null
+++ b/src/kernels/MIOpenL1Loss.cpp
@@ -0,0 +1,64 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "tensor_view.hpp"
+
+template <typename TIO>
+__device__ void L1LossReducedForward5d_kernel(const TIO* I,
+                                              const TIO* T,
+                                              FLOAT_ACCUM* lsum,
+                                              const size_t divisor,
+                                              tensor_view_t<5> I_tv,
+                                              tensor_view_t<5> T_tv)
+{
+    const size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const float div  = static_cast<float>(divisor);
+    tensor_layout_t<5> input_layout(I_tv, gid);
+
+    if(input_layout.layout[0] >= I_tv.size[0])
+        return;
+
+    size_t Iidx = I_tv.get_tensor_view_idx(input_layout);
+    size_t Tidx = T_tv.get_tensor_view_idx(input_layout);
+
+    FLOAT_ACCUM diff = abs(CVT_FLOAT2ACCUM(I[Iidx]) - CVT_FLOAT2ACCUM(T[Tidx])) / div;
+    lsum[gid]        = diff;
+}
+
+extern "C" __global__ void L1LossReducedForward5d(const IO_TYPE* I,
+                                                  const IO_TYPE* T,
+                                                  FLOAT_ACCUM* lsum,
+                                                  const size_t divisor,
+                                                  tensor_view_t<5> I_tv,
+                                                  tensor_view_t<5> T_tv)
+{
+    L1LossReducedForward5d_kernel<IO_TYPE>(I, T, lsum, divisor, I_tv, T_tv);
+}
diff --git a/src/kernels/MIOpenLossReduce.cpp b/src/kernels/MIOpenLossReduce.cpp
new file mode 100644
index 0000000000..adc1eb2cb1
--- /dev/null
+++ b/src/kernels/MIOpenLossReduce.cpp
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "warp_reduce.hpp"
+
+template <typename DTYPE>
+__device__ void LossSum(const DTYPE* __restrict__ input, DTYPE* __restrict__ output, uint64_t N)
+{
+    auto gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    FLOAT_ACCUM val = gid < N ? CVT_FLOAT2ACCUM(input[gid]) : CVT_FP32_2ACCUM(0.0f);
+    val             = block_reduce_sum(val);
+
+    if(threadIdx.x == 0)
+        output[blockIdx.x] = CVT_ACCUM2FLOAT(val);
+}
+
+extern "C" __global__ void
+ReduceSumLoss(const FLOAT* __restrict__ input, FLOAT* __restrict__ output, uint64_t N)
+{
+    // instantiate the kernel
+    LossSum<FLOAT>(input, output, N);
+}
\ No newline at end of file
diff --git a/src/l1loss.cpp b/src/l1loss.cpp
new file mode 100644
index 0000000000..69fcf827d9
--- /dev/null
+++ b/src/l1loss.cpp
@@ -0,0 +1,90 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/find_solution.hpp>
+#include <miopen/l1loss.hpp>
+#include <miopen/l1loss/invoke_params.hpp>
+#include <miopen/l1loss/problem_description.hpp>
+#include <miopen/l1loss/solvers.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+size_t GetL1LossForwardWorkspaceSize(Handle& handle,
+                                     miopenLossReductionMode_t reduction,
+                                     const TensorDescriptor& iDesc,
+                                     const TensorDescriptor& tDesc,
+                                     const TensorDescriptor& oDesc)
+{
+    auto ctx           = ExecutionContext{&handle};
+    const auto problem = l1loss::FwdProblemDescription{iDesc, tDesc, oDesc, reduction};
+
+    const auto algo    = AlgorithmName{"L1LossForward"};
+    const auto solvers = solver::SolverContainer<solver::l1loss::L1LossForward5d>{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+
+    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
+}
+
+miopenStatus_t L1LossForward(Handle& handle,
+                             miopenLossReductionMode_t reduction,
+                             Data_t workspace,
+                             size_t workspaceSizeInBytes,
+                             const TensorDescriptor& iDesc,
+                             ConstData_t i,
+                             const TensorDescriptor& tDesc,
+                             ConstData_t t,
+                             const TensorDescriptor& oDesc,
+                             Data_t o)
+{
+    const auto problem = l1loss::FwdProblemDescription{iDesc, tDesc, oDesc, reduction};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = l1loss::InvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.reduction      = reduction;
+        tmp.iDesc          = &iDesc;
+        tmp.tDesc          = &tDesc;
+        tmp.oDesc          = &oDesc;
+        tmp.i              = i;
+        tmp.t              = t;
+        tmp.o              = o;
+        tmp.workspace      = workspace;
+        tmp.workspace_size = workspaceSizeInBytes;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"L1LossForward"};
+    const auto solvers = solver::SolverContainer<solver::l1loss::L1LossForward5d>{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
diff --git a/src/l1loss/problem_description.cpp b/src/l1loss/problem_description.cpp
new file mode 100644
index 0000000000..cbd2b370e0
--- /dev/null
+++ b/src/l1loss/problem_description.cpp
@@ -0,0 +1,53 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/l1loss/problem_description.hpp>
+#include <miopen/names.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace l1loss {
+
+NetworkConfig FwdProblemDescription::MakeNetworkConfig() const
+{
+    auto input_dtype = iDesc.GetType();
+    auto size        = iDesc.GetElementSize();
+
+    std::ostringstream ss;
+
+    ss << "l1loss_fwd";
+    ss << "reduction" << reduction;
+    ss << "i_dtype" << input_dtype;
+    ss << "size" << size;
+
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace l1loss
+
+} // namespace miopen
diff --git a/src/l1loss_api.cpp b/src/l1loss_api.cpp
new file mode 100644
index 0000000000..4313a4eca4
--- /dev/null
+++ b/src/l1loss_api.cpp
@@ -0,0 +1,79 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/errors.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/l1loss.hpp>
+#include <miopen/logger.hpp>
+#include <miopen/miopen.h>
+#include <miopen/tensor_ops.hpp>
+
+extern "C" miopenStatus_t miopenGetL1LossForwardWorkspaceSize(miopenHandle_t handle,
+                                                              miopenLossReductionMode_t reduction,
+                                                              const miopenTensorDescriptor_t iDesc,
+                                                              const miopenTensorDescriptor_t tDesc,
+                                                              const miopenTensorDescriptor_t oDesc,
+                                                              size_t* sizeInBytes)
+{
+
+    MIOPEN_LOG_FUNCTION(handle, reduction, iDesc, tDesc, oDesc, sizeInBytes);
+
+    return miopen::try_([&] {
+        miopen::deref(sizeInBytes) = miopen::GetL1LossForwardWorkspaceSize(miopen::deref(handle),
+                                                                           reduction,
+                                                                           miopen::deref(iDesc),
+                                                                           miopen::deref(tDesc),
+                                                                           miopen::deref(oDesc));
+    });
+}
+
+extern "C" miopenStatus_t miopenL1LossForward(miopenHandle_t handle,
+                                              miopenLossReductionMode_t reduction,
+                                              void* workspace,
+                                              size_t workspaceSizeInBytes,
+                                              const miopenTensorDescriptor_t iDesc,
+                                              const void* i,
+                                              const miopenTensorDescriptor_t tDesc,
+                                              const void* t,
+                                              const miopenTensorDescriptor_t oDesc,
+                                              void* o)
+{
+    MIOPEN_LOG_FUNCTION(
+        handle, reduction, workspace, workspaceSizeInBytes, iDesc, i, tDesc, t, oDesc, o);
+
+    return miopen::try_([&] {
+        miopen::L1LossForward(miopen::deref(handle),
+                              reduction,
+                              DataCast(workspace),
+                              workspaceSizeInBytes,
+                              miopen::deref(iDesc),
+                              DataCast(i),
+                              miopen::deref(tDesc),
+                              DataCast(t),
+                              miopen::deref(oDesc),
+                              DataCast(o));
+    });
+}
diff --git a/src/solver.cpp b/src/solver.cpp
index 856f04a8cf..808aa17409 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -34,6 +34,7 @@
 #include <miopen/groupnorm/solvers.hpp>
 #include <miopen/getitem/solvers.hpp>
 #include <miopen/kthvalue/solvers.hpp>
+#include <miopen/l1loss/solvers.hpp>
 #include <miopen/layernorm/solvers.hpp>
 #include <miopen/pooling/solvers.hpp>
 #include <miopen/prelu/solvers.hpp>
@@ -672,7 +673,6 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
 
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
-
     Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId());
     Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId());
     Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId());
@@ -715,6 +715,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              multimarginloss::MultiMarginLossForward{}.SolverDbId());
 
     Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Loss, l1loss::L1LossForward5d{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white
     // space between this comment and the newly registered solver(s)!
 }
diff --git a/src/solver/l1loss/forward_l1loss.cpp b/src/solver/l1loss/forward_l1loss.cpp
new file mode 100644
index 0000000000..d9b8d544e5
--- /dev/null
+++ b/src/solver/l1loss/forward_l1loss.cpp
@@ -0,0 +1,243 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/common.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/hipoc_kernel.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/kernel_info.hpp>
+#include <miopen/l1loss/invoke_params.hpp>
+#include <miopen/l1loss/problem_description.hpp>
+#include <miopen/l1loss/solvers.hpp>
+#include <miopen/miopen.h>
+#include <miopen/mlo_internal.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#include <cstddef>
+#include <vector>
+
+#define LOCAL_SIZE_FWD 256
+#define LOCAL_SIZE_REDUCE 1024
+
+namespace miopen {
+
+namespace solver {
+
+namespace l1loss {
+
+bool L1LossForward5d::IsImprovementOverROCm(
+    const ExecutionContext& /*context*/, const miopen::l1loss::FwdProblemDescription& problem) const
+{
+    if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+bool L1LossForward5d::IsApplicable(const ExecutionContext& /*context*/,
+                                   const miopen::l1loss::FwdProblemDescription& problem) const
+{
+    if(!IsImprovementOverROCm({}, problem))
+    {
+        return false;
+    }
+
+    return true;
+}
+
+ConvSolution
+L1LossForward5d::GetSolution(const ExecutionContext& /*context*/,
+                             const miopen::l1loss::FwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto dtype      = problem.GetODesc().GetType();
+    auto io_dtype   = miopen::GetDataType(dtype);
+    auto input_size = problem.GetIDesc().GetElementSize();
+
+    const auto build_params =
+        KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+                              {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+                              {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+                              {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+                              {"IO_TYPE", io_dtype == "bfloat16" ? "ushort" : io_dtype},
+                              {"REDUCE_SIZE", LOCAL_SIZE_REDUCE}};
+
+    {
+        /* Phase 1: Calculate loss elementwise. (TIO to FLOAT_ACCUM) */
+        size_t xlocalsize = LOCAL_SIZE_FWD;
+        size_t xgridsize  = AlignUp(input_size, xlocalsize);
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenL1Loss.cpp";
+        kernel.kernel_name = "L1LossReducedForward5d";
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(1);
+        kernel.l_wk.push_back(1);
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(1);
+        kernel.g_wk.push_back(1);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    {
+        /* Phase 2: Reduce sum (FLOAT_ACCUM to FLOAT_ACCUM) */
+        auto _size = input_size;
+
+        do
+        {
+            size_t xlocalsize = LOCAL_SIZE_REDUCE;
+            size_t xgridsize  = AlignUp(_size, xlocalsize);
+
+            auto kernel        = KernelInfo{};
+            kernel.kernel_file = "MIOpenReduceSum.cpp";
+            kernel.kernel_name = "ReduceSumFLOATACCUM";
+
+            kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+            kernel.l_wk.push_back(xlocalsize);
+            kernel.l_wk.push_back(1);
+            kernel.l_wk.push_back(1);
+            kernel.g_wk.push_back(xgridsize);
+            kernel.g_wk.push_back(1);
+            kernel.g_wk.push_back(1);
+
+            result.construction_params.push_back(kernel);
+            _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+        } while(_size > LOCAL_SIZE_REDUCE);
+
+        /* Reduce sum (FLOAT_ACCUM to TIO) */
+        size_t xlocalsize = LOCAL_SIZE_REDUCE;
+        size_t xgridsize  = AlignUp(_size, xlocalsize);
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenReduceSum.cpp";
+        kernel.kernel_name = "ReduceSum";
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(1);
+        kernel.l_wk.push_back(1);
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(1);
+        kernel.g_wk.push_back(1);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [input_size, dtype](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) params = raw_params.CastTo<miopen::l1loss::InvokeParams>();
+
+            auto elapsed = 0.f;
+            HipEventPtr start, stop;
+
+            const bool profiling = handle_.IsProfilingEnabled();
+            if(profiling)
+            {
+                handle_.EnableProfiling(false);
+                start = miopen::make_hip_event();
+                stop  = miopen::make_hip_event();
+                hipEventRecord(start.get(), handle_.GetStream());
+            }
+
+            {
+                /* Phase 1: Calculate loss elementwise. */
+                auto I_tv      = get_inner_expanded_tv<5>(deref(params.iDesc));
+                auto T_tv      = get_inner_expanded_tv<5>(deref(params.tDesc));
+                size_t divisor = (params.reduction == MIOPEN_LOSS_REDUCTION_SUM) ? 1 : input_size;
+
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                kernel(params.i, params.t, params.workspace, divisor, I_tv, T_tv);
+            }
+
+            {
+                /* Phase 2: Reduce. */
+                auto _size      = input_size;
+                auto reduce_in  = params.workspace;
+                auto reduce_out = static_cast<Data_t>(static_cast<char*>(params.workspace) +
+                                                      input_size * get_data_size(dtype));
+
+                for(size_t i = 1; i < kernels.size() - 1; ++i)
+                {
+                    decltype(auto) kernel = handle_.Run(kernels[i]);
+
+                    kernel(reduce_in, reduce_out, _size);
+                    std::swap(reduce_in, reduce_out);
+
+                    _size = (_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+                }
+
+                decltype(auto) kernel = handle_.Run(kernels.back());
+                kernel(reduce_in, params.o, _size);
+
+                if(profiling)
+                {
+                    hipEventRecord(stop.get(), handle_.GetStream());
+                    hipEventSynchronize(stop.get());
+                    hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                    // Clean up
+                    hipEventDestroy(start.get());
+                    hipEventDestroy(stop.get());
+                    handle_.ResetKernelTime();
+                    handle_.AccumKernelTime(elapsed);
+
+                    handle_.EnableProfiling(true);
+                }
+            }
+        };
+    };
+
+    return result;
+}
+
+std::size_t
+L1LossForward5d::GetWorkspaceSize(const ExecutionContext& /*context*/,
+                                  const miopen::l1loss::FwdProblemDescription& problem) const
+{
+    if(problem.GetReduction() == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        return 0;
+    }
+
+    size_t input_size = problem.GetIDesc().GetElementSize();
+    return (input_size + (input_size + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE) *
+           get_data_size(miopenFloat);
+}
+
+} // namespace l1loss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/test/cpu_l1loss.hpp b/test/cpu_l1loss.hpp
new file mode 100644
index 0000000000..9e2ef0d121
--- /dev/null
+++ b/test/cpu_l1loss.hpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/miopen.h>
+
+#include "tensor_holder.hpp"
+#include <cstddef>
+
+template <class T>
+void cpu_l1loss_reduced_forward(tensor<T> input,
+                                tensor<T> target,
+                                tensor<T>& ref_output,
+                                miopenLossReductionMode_t reduction)
+{
+    auto inputSize = input.desc.GetElementSize();
+    size_t divisor = (reduction == MIOPEN_LOSS_REDUCTION_SUM) ? 1 : inputSize;
+
+    double output = 0.0;
+    for(size_t i = 0; i < inputSize; i++)
+    {
+        float diff = abs(static_cast<float>(input[i]) - static_cast<float>(target[i]));
+        output += diff;
+    }
+    ref_output[0] = output / divisor;
+}
diff --git a/test/gtest/l1loss.cpp b/test/gtest/l1loss.cpp
new file mode 100644
index 0000000000..a62fb13bc0
--- /dev/null
+++ b/test/gtest/l1loss.cpp
@@ -0,0 +1,59 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "l1loss.hpp"
+using float16 = half_float::half;
+
+namespace l1loss {
+
+using GPU_L1Loss_fwd_FP32  = L1LossFwdTest<float>;
+using GPU_L1Loss_fwd_FP16  = L1LossFwdTest<float16>;
+using GPU_L1Loss_fwd_BFP16 = L1LossFwdTest<bfloat16>;
+
+} // namespace l1loss
+using namespace l1loss;
+
+TEST_P(GPU_L1Loss_fwd_FP32, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_L1Loss_fwd_FP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+TEST_P(GPU_L1Loss_fwd_BFP16, Test)
+{
+    RunTest();
+    Verify();
+};
+
+INSTANTIATE_TEST_SUITE_P(Full, GPU_L1Loss_fwd_FP32, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_L1Loss_fwd_FP16, testing::ValuesIn(GenFullTestCases()));
+INSTANTIATE_TEST_SUITE_P(Full, GPU_L1Loss_fwd_BFP16, testing::ValuesIn(GenFullTestCases()));
diff --git a/test/gtest/l1loss.hpp b/test/gtest/l1loss.hpp
new file mode 100644
index 0000000000..82555572f7
--- /dev/null
+++ b/test/gtest/l1loss.hpp
@@ -0,0 +1,206 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cpu_l1loss.hpp"
+#include "get_handle.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include "random.hpp"
+
+#include <cstddef>
+#include <gtest/gtest.h>
+#include <limits>
+
+#include <miopen/miopen.h>
+#include <miopen/l1loss.hpp>
+
+struct L1LossTestCase
+{
+    std::vector<size_t> dims;
+    miopenLossReductionMode_t reduction;
+    bool isContiguous;
+
+    friend std::ostream& operator<<(std::ostream& os, const L1LossTestCase& tc)
+    {
+        os << "Dims: ";
+        for(auto dim_sz : tc.dims)
+        {
+            os << dim_sz << " ";
+        }
+        return os << " reducion mode: " << tc.reduction << " contiguous: " << tc.isContiguous;
+    }
+
+    L1LossTestCase() {}
+
+    L1LossTestCase(std::vector<size_t> dims_, miopenLossReductionMode_t reduction_, bool cont_)
+        : dims(dims_), reduction(reduction_), isContiguous(cont_)
+    {
+    }
+
+    std::vector<size_t> ComputeStrides() const
+    {
+        std::vector<size_t> inputDim = dims;
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size());
+        strides.back() = 1;
+        for(int i = inputDim.size() - 2; i >= 0; --i)
+            strides[i] = strides[i + 1] * inputDim[i + 1];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
+
+inline std::vector<L1LossTestCase> GenFullTestCases()
+{ // n c d h w dim
+    // clang-format off
+    return {
+        {{1, 1, 1, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
+        {{1, 2, 3, 4, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
+        {{1, 1, 1, 257, 1}, MIOPEN_LOSS_REDUCTION_SUM, false},
+        {{2, 10, 128, 64, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false},
+        {{5, 13, 17, 11, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false},
+        {{256, 4, 128, 1, 1}, MIOPEN_LOSS_REDUCTION_MEAN, false},
+        {{256, 4, 128, 1, 1}, MIOPEN_LOSS_REDUCTION_MEAN, true},
+        {{1, 1, 1, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
+        {{34, 4, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
+        {{4, 7, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true},
+        {{15, 4, 5, 1, 1}, MIOPEN_LOSS_REDUCTION_SUM, true}
+    };
+    // clang-format on
+}
+
+template <typename T>
+struct L1LossFwdTest : public ::testing::TestWithParam<L1LossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle   = get_handle();
+        l1loss_config   = GetParam();
+        auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 99); };
+
+        reduction       = l1loss_config.reduction;
+        auto in_dims    = l1loss_config.dims;
+        auto in_strides = l1loss_config.ComputeStrides();
+        input           = tensor<T>{in_dims, in_strides}.generate(gen_value1);
+
+        auto tar_strides = l1loss_config.ComputeStrides();
+        target           = tensor<T>{in_dims, tar_strides}.generate(gen_value2);
+
+        auto out_lengths =
+            (reduction == MIOPEN_LOSS_REDUCTION_NONE) ? in_dims : std::vector<size_t>{1};
+
+        output = tensor<T>{out_lengths};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_lengths};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ws_sizeInBytes = (reduction == MIOPEN_LOSS_REDUCTION_NONE)
+                             ? 0
+                             : miopen::GetL1LossForwardWorkspaceSize(
+                                   handle, reduction, input.desc, target.desc, output.desc);
+        if(ws_sizeInBytes == static_cast<size_t>(-1))
+            GTEST_SKIP();
+
+        if(ws_sizeInBytes != 0)
+        {
+            std::vector<size_t> workspace_dims;
+            workspace_dims.push_back(ws_sizeInBytes / sizeof(float));
+
+            workspace = tensor<float>{workspace_dims};
+            std::fill(workspace.begin(), workspace.end(), static_cast<float>(0));
+
+            workspace_dev = handle.Write(workspace.data);
+        }
+
+        input_dev  = handle.Write(input.data);
+        target_dev = handle.Write(target.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            cpu_l1loss_reduced_forward<T>(input, target, ref_output, reduction);
+            status         = miopen::L1LossForward(handle,
+                                           reduction,
+                                           workspace_dev.get(),
+                                           ws_sizeInBytes,
+                                           input.desc,
+                                           input_dev.get(),
+                                           target.desc,
+                                           target_dev.get(),
+                                           output.desc,
+                                           output_dev.get());
+            workspace.data = handle.Read<float>(workspace_dev, workspace.data.size());
+        }
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    double GetTolerance()
+    {
+        double tolerance = std::numeric_limits<T>::epsilon() * 10;
+        return tolerance;
+    }
+
+    void Verify()
+    {
+        double threshold = GetTolerance();
+        auto error       = miopen::rms_range(ref_output, output);
+
+        EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error
+                                            << ",  Tolerance: " << threshold * 10;
+    }
+
+    L1LossTestCase l1loss_config;
+
+    tensor<T> input;
+    tensor<T> target;
+    tensor<T> output;
+    tensor<float> workspace;
+    miopenLossReductionMode_t reduction;
+
+    tensor<T> ref_output;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    size_t ws_sizeInBytes;
+};