diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp index 7745ddf34c..e939ce8dfe 100644 --- a/client_example/06_softmax/softmax4d.cpp +++ b/client_example/06_softmax/softmax4d.cpp @@ -47,8 +47,8 @@ int main(int argc, char* argv[]) ck::index_t num_elements = std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies()); - AccDataType alpha{2.0f}; - AccDataType beta{2.0f}; + double alpha{2.0}; + double beta{2.0}; SimpleDeviceMem in(sizeof(InDataType) * num_elements); SimpleDeviceMem out(sizeof(OutDataType) * num_elements); @@ -82,8 +82,8 @@ int main(int argc, char* argv[]) auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, in_strides, reduce_dims, - &alpha, - &beta, + alpha, + beta, in.GetDeviceBuffer(), out.GetDeviceBuffer(), PassThrough{}, @@ -129,8 +129,8 @@ int main(int argc, char* argv[]) auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, in_strides, reduce_dims, - &alpha, - &beta, + alpha, + beta, in.GetDeviceBuffer(), out.GetDeviceBuffer(), PassThrough{}, @@ -147,4 +147,4 @@ int main(int argc, char* argv[]) } return 0; -} \ No newline at end of file +} diff --git a/client_example/15_reduce/reduce_nhwc_c.cpp b/client_example/15_reduce/reduce_nhwc_c.cpp index 8f4902ae25..2275158bcb 100644 --- a/client_example/15_reduce/reduce_nhwc_c.cpp +++ b/client_example/15_reduce/reduce_nhwc_c.cpp @@ -61,8 +61,8 @@ int main(int argc, char* argv[]) for(auto dim : reduce_dims) reduce_length *= in_lengths[dim]; - float alpha{1.0f}; - float beta{0.0f}; + double alpha{1.0}; + double beta{0.0}; SimpleDeviceMem in(sizeof(InDataType) * num_in_elements); SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements); diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp index 6df549448d..e6e3cc8d52 100644 --- a/example/12_reduce/reduce_blockwise_impl.hpp +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -267,8 +267,8 @@ int reduce_blockwise_impl(bool do_verification, arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in.mData.data(), nullptr, out_ref.mData.data(), @@ -295,8 +295,8 @@ int reduce_blockwise_impl(bool do_verification, arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in_dev.GetDeviceBuffer(), nullptr, out_dev.GetDeviceBuffer(), diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index a86ea7b56a..dbb18a0d83 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -226,8 +226,8 @@ int main(int argc, char* argv[]) arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in_1.mData.data(), nullptr, out_ref.mData.data(), @@ -254,8 +254,8 @@ int main(int argc, char* argv[]) arrInLengths_2, arrInStrides_2, reduceDims_1, - 1.0f, - 0.0f, + 1.0, + 0.0, in_1_dev.GetDeviceBuffer(), nullptr, in_2_dev.GetDeviceBuffer(), @@ -278,8 +278,8 @@ int main(int argc, char* argv[]) arrOutLengths, arrOutStrides, reduceDims_2, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in_2_dev.GetDeviceBuffer(), nullptr, out_dev.GetDeviceBuffer(), diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp index 100a20d2a2..905242fb6b 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -180,8 +180,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in.mData.data(), nullptr, out_ref.mData.data(), @@ -208,8 +208,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in_dev.GetDeviceBuffer(), nullptr, out_dev.GetDeviceBuffer(), diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp index 8854bf047b..41afd72f5a 100644 --- a/example/23_softmax/softmax_blockwise.cpp +++ b/example/23_softmax/softmax_blockwise.cpp @@ -56,8 +56,8 @@ class SimpleAppArgs int option_index = 0; public: - std::vector inLengths = {8, 128, 2048}; - std::vector scales = {2.0f, 2.0f}; + std::vector inLengths = {8, 128, 2048}; + std::vector scales = {2.0, 2.0}; bool do_verification = true; int init_method = 2; @@ -151,8 +151,8 @@ int main(int argc, char* argv[]) auto inStrides = in.mDesc.GetStrides(); auto outStrides = out.mDesc.GetStrides(); - AccDataType alpha = args.scales[0]; - AccDataType beta = args.scales[1]; + double alpha = args.scales[0]; + double beta = args.scales[1]; std::cout << "in: " << in.mDesc << std::endl; std::cout << "out: " << out.mDesc << std::endl; @@ -221,8 +221,8 @@ int main(int argc, char* argv[]) auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths, i_inStrides, reduceDims, - &alpha, - &beta, + alpha, + beta, in_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(), PassThrough{}, diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp index 376b95ea7b..326606752b 100644 --- a/example/33_multiple_reduce/dual_reduce_common.hpp +++ b/example/33_multiple_reduce/dual_reduce_common.hpp @@ -217,8 +217,8 @@ int mean_meansquare_dual_reduce_test(size_t n, size_t invariant_total_length = n; size_t reduce_total_length = h * w * c; - const AccDataType alpha = ck::type_convert(1.0f); - const AccDataType beta = ck::type_convert(0.0f); + const double alpha = 1.0f; + const double beta = 0.0f; std::size_t num_thread = 1; @@ -267,8 +267,8 @@ int mean_meansquare_dual_reduce_test(size_t n, i_outLengths, {i_outStrides, i_outStrides}, reduceDims, - {&alpha, &alpha}, - {&beta, &beta}, + {alpha, alpha}, + {beta, beta}, in_dev.GetDeviceBuffer(), {mean_dev.GetDeviceBuffer(), meansquare_dev.GetDeviceBuffer()}, ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}), diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp index d8a791c322..9491a92247 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp @@ -32,7 +32,7 @@ struct DeviceElementwiseNormalization : public BaseOperator const std::vector betaStrides, const std::vector yStrides, const std::vector reduceDims, - AccDataType epsilon, + double epsilon, const std::array in_dev_buffers, const void* p_gamma, const void* p_beta, diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp index 93202e352e..ee4b53e2fc 100644 --- a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp @@ -32,8 +32,8 @@ struct DeviceMultipleReduce : public BaseOperator const std::array outLengths, const std::array, NumReduction> outStrides, const std::array reduceDims, - const std::array alphas, - const std::array betas, + const std::array alphas, + const std::array betas, const void* in_dev, const std::array out_dev_buffers, const InElementwiseOperationTuple in_elementwise_op_tuple, diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp index 227c352cbd..ec17ec3d18 100644 --- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp @@ -28,7 +28,7 @@ struct DeviceNormalization : public BaseOperator const std::vector betaStrides, const std::vector yStrides, const std::vector reduceDims, - AccDataType epsilon, + double epsilon, const void* p_x, const void* p_gamma, const void* p_beta, diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp index 531d0d0f81..c9209f2d7d 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp @@ -33,8 +33,8 @@ struct DeviceReduce : public BaseOperator const std::array outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const void* in_dev, const void* in_index_dev, void* out_dev, diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp index 676e0812b7..94f788e517 100644 --- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp +++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp @@ -27,10 +27,8 @@ struct DeviceSoftmax : public BaseOperator // @param[in] inLengths Input tensor extent(s) from high to low dimension // @param[in] inStrides Input tensor stride(s) from high to low dimension // @param[in] reduceDims The dimension(s) the normalization operation is applied - // @param[in] alpha Typeless pointer in host memory storing the alpha scaling - // value as type AccDataType - // @param[in] beta Typeless pointer in host memory storing the beta scaling - // value as type AccDataType + // @param[in] alpha double type value + // @param[in] beta double type value // @param[in] in_dev Typeless const pointer in device memory storing the input // tensor // @param out_dev Typeless pointer in device memory storing the output tensor @@ -43,8 +41,8 @@ struct DeviceSoftmax : public BaseOperator MakeArgumentPointer(const std::vector inLengths, const std::vector inStrides, const std::vector reduceDims, - const void* alpha, - const void* beta, + double alpha, + double beta, const void* in_dev, void* out_dev, InElementwiseOp in_elementwise_op, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp index 8ffc5ef9fb..1085bdf922 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp @@ -270,18 +270,18 @@ struct DeviceElementwiseNormalizationImpl const std::vector reduceDims, XElementwiseOperation x_elementwise_op, YElementwiseOperation y_elementwise_op, - AccDataType epsilon, + double epsilon, const std::array in_dev_buffers, const GammaDataType* p_gamma, const BetaDataType* p_beta, YDataType* p_y) - : epsilon_(epsilon), - p_gamma_(p_gamma), + : p_gamma_(p_gamma), p_beta_(p_beta), p_y_(p_y), x_elementwise_op_(x_elementwise_op), y_elementwise_op_(y_elementwise_op) { + epsilon_ = static_cast(epsilon); Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); for(int i = 0; i < NumInput; i++) @@ -543,7 +543,7 @@ struct DeviceElementwiseNormalizationImpl const std::vector betaStrides, const std::vector yStrides, const std::vector reduceDims, - AccDataType epsilon, + double epsilon, const std::array in_dev_buffers, const void* p_gamma, const void* p_beta, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp index 6b730b1265..b49e109682 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp @@ -270,8 +270,8 @@ struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce& outLengths, const std::array, NumReduction>& outStridesArray, const std::array& reduceDims, - const std::array& alphas, - const std::array& betas, + const std::array& alphas, + const std::array& betas, const void* in_dev, const std::array& out_dev_buffers, const InElementwiseOperationTuple in_elementwise_op_tuple, @@ -286,8 +286,8 @@ struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce(alphas[i]); - beta_values_(i) = *static_cast(betas[i]); + alpha_values_(i) = static_cast(alphas[i]); + beta_values_(i) = static_cast(betas[i]); }; in_dev_ = static_cast(in_dev); @@ -547,8 +547,8 @@ struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce outLengths, const std::array, NumReduction> outStridesArray, const std::array reduceDims, - const std::array alphas, - const std::array betas, + const std::array alphas, + const std::array betas, const void* in_dev, const std::array out_dev_buffers, const InElementwiseOperationTuple in_elementwise_op_tuple, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp index ff8465e9fc..17a96e9f6f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp @@ -195,8 +195,8 @@ struct DeviceMultipleReduceThreadWise : public DeviceMultipleReduce& outLengths, const std::array, NumReduction>& outStridesArray, const std::array& reduceDims, - const std::array& alphas, - const std::array& betas, + const std::array& alphas, + const std::array& betas, const void* in_dev, const std::array& out_dev_buffers, const InElementwiseOperationTuple in_elementwise_op_tuple, @@ -211,8 +211,8 @@ struct DeviceMultipleReduceThreadWise : public DeviceMultipleReduce(alphas[i]); - beta_values_(i) = *static_cast(betas[i]); + alpha_values_(i) = static_cast(alphas[i]); + beta_values_(i) = static_cast(betas[i]); }; in_dev_ = static_cast(in_dev); @@ -374,8 +374,8 @@ struct DeviceMultipleReduceThreadWise : public DeviceMultipleReduce outLengths, const std::array, NumReduction> outStridesArray, const std::array reduceDims, - const std::array alphas, - const std::array betas, + const std::array alphas, + const std::array betas, const void* in_dev, const std::array out_dev_buffers, const InElementwiseOperationTuple in_elementwise_op_tuple, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp index 47d9df8025..8cc223a886 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp @@ -221,18 +221,19 @@ struct DeviceNormalizationImpl : public DeviceNormalization yStrides, const std::vector reduceDims, AccElementwiseOperation acc_elementwise_op, - AccDataType epsilon, + double epsilon, const XDataType* p_x, const GammaDataType* p_gamma, const BetaDataType* p_beta, YDataType* p_y) - : epsilon_(epsilon), - p_x_(p_x), + : p_x_(p_x), p_gamma_(p_gamma), p_beta_(p_beta), p_y_(p_y), acc_elementwise_op_(acc_elementwise_op) { + epsilon_ = static_cast(epsilon); + Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); xStrides_ = shuffle_tensor_dimensions(xStrides, reduceDims); yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); @@ -421,7 +422,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization betaStrides, const std::vector yStrides, const std::vector reduceDims, - AccDataType epsilon, + double epsilon, const void* p_x, const void* p_gamma, const void* p_beta, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index 8abe8884a1..c7868537fe 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -217,8 +217,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const InDataType* in_dev, const IndexDataType* in_index_dev, OutDataType* out_dev, @@ -502,8 +502,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const void* in_dev, const void* in_index_dev, void* out_dev, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp index 888485228a..a1d976f1a1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -165,8 +165,8 @@ struct DeviceReduceThreadWise : public DeviceReduce outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const InDataType* in_dev, OutDataType* out_dev, IndexDataType* out_index_dev, @@ -341,8 +341,8 @@ struct DeviceReduceThreadWise : public DeviceReduce outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const void* in_dev, const void* in_index_dev, void* out_dev, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp index 8630a2c6e2..ed96b7340c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -156,19 +156,20 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax inLengths, const std::vector inStrides, const std::vector reduceDims, - AccDataType alpha, - AccDataType beta, + double alpha, + double beta, const InDataType* in_dev, OutDataType* out_dev, InElementwiseOp in_elementwise_op, AccElementwiseOp acc_elementwise_op) - : alpha_{alpha}, - beta_{beta}, - in_dev_{in_dev}, + : in_dev_{in_dev}, out_dev_{out_dev}, in_elementwise_op_{in_elementwise_op}, acc_elementwise_op_{acc_elementwise_op} { + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + if(Rank != inLengths.size() || Rank != inStrides.size() || NumReduceDim != reduceDims.size()) { @@ -336,8 +337,8 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax inLengths, const std::vector inStrides, const std::vector reduceDims, - const AccDataType alpha, - const AccDataType beta, + double alpha, + double beta, const InDataType* in_dev, OutDataType* out_dev, InElementwiseOp in_elementwise_op, @@ -375,8 +376,8 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax MakeArgumentPointer(const std::vector inLengths, const std::vector inStrides, const std::vector reduceDims, - const void* alpha, - const void* beta, + double alpha, + double beta, const void* in_dev, void* out_dev, InElementwiseOp in_elementwise_op, @@ -385,8 +386,8 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax(inLengths, inStrides, reduceDims, - *static_cast(alpha), - *static_cast(beta), + alpha, + beta, static_cast(in_dev), static_cast(out_dev), in_elementwise_op, diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp index c83523f0d1..c04baca574 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp @@ -56,8 +56,8 @@ struct ReferenceReduce : public device::DeviceReduce outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const InDataType* in_host, OutDataType* out_host, IndexDataType* out_index_host, @@ -388,8 +388,8 @@ struct ReferenceReduce : public device::DeviceReduce outLengths, const std::array outStrides, const std::array reduceDims, - float alpha, - float beta, + double alpha, + double beta, const void* in_host, const void* in_index_host, void* out_host, diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp index 4839eb8ade..a4fd46c932 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp @@ -24,11 +24,14 @@ struct ReferenceSoftmax : public device::BaseOperator { Argument(const Tensor& in, Tensor& out, - AccDataType alpha, - AccDataType beta, + double alpha, + double beta, const std::vector sm_reduce_dims) - : in_(in), out_(out), alpha_(alpha), beta_(beta), sm_reduce_dims_(sm_reduce_dims) + : in_(in), out_(out), sm_reduce_dims_(sm_reduce_dims) { + alpha_ = static_cast(alpha); + beta_ = static_cast(beta); + // std::cout << "debug: scalar dims: "; for(size_t i = 0; i < in.mDesc.GetNumOfDimension(); i++) { @@ -143,8 +146,8 @@ struct ReferenceSoftmax : public device::BaseOperator static auto MakeArgument(const Tensor& in, Tensor& out, - AccDataType alpha, - AccDataType beta, + double alpha, + double beta, const std::vector sm_reduce_dims) { return Argument{in, out, alpha, beta, sm_reduce_dims}; diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp index 0759c53a3c..e618200299 100644 --- a/profiler/include/profiler/profile_reduce_impl.hpp +++ b/profiler/include/profiler/profile_reduce_impl.hpp @@ -332,8 +332,8 @@ bool profile_reduce_impl_impl(bool do_verification, arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in.mData.data(), nullptr, out_ref.mData.data(), @@ -361,8 +361,8 @@ bool profile_reduce_impl_impl(bool do_verification, arrOutLengths, arrOutStrides, reduceDims, - alpha, - beta, + static_cast(alpha), + static_cast(beta), in_dev.GetDeviceBuffer(), nullptr, out_dev.GetDeviceBuffer(), diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp index 090cdaaa9a..96816f53bb 100644 --- a/profiler/include/profiler/profile_softmax_impl.hpp +++ b/profiler/include/profiler/profile_softmax_impl.hpp @@ -48,8 +48,8 @@ bool profile_softmax_impl(int do_verification, std::vector in_length, std::vector in_strides, std::vector reduce_dims, - AccDataType alpha, - AccDataType beta) + double alpha, + double beta) { if(Rank != in_length.size()) { @@ -122,8 +122,8 @@ bool profile_softmax_impl(int do_verification, auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths, in_tensor_strides, reduce_dims, - &alpha, - &beta, + alpha, + beta, in_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(), PassThrough{}, diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp index 30f627dd29..78b64dda7d 100644 --- a/profiler/src/profile_softmax.cpp +++ b/profiler/src/profile_softmax.cpp @@ -99,8 +99,8 @@ int profile_softmax(int argc, char* argv[]) length, stride, reduce, - float(alpha), - float(beta)); + double(alpha), + double(beta)); } else if(data_type == SoftmaxDataType::F32_F32) { @@ -111,8 +111,8 @@ int profile_softmax(int argc, char* argv[]) length, stride, reduce, - float(alpha), - float(beta)); + double(alpha), + double(beta)); } else { @@ -131,8 +131,8 @@ int profile_softmax(int argc, char* argv[]) length, stride, reduce, - float(alpha), - float(beta)); + double(alpha), + double(beta)); } else if(data_type == SoftmaxDataType::F32_F32) { @@ -143,8 +143,8 @@ int profile_softmax(int argc, char* argv[]) length, stride, reduce, - float(alpha), - float(beta)); + double(alpha), + double(beta)); } else {