diff --git a/impl/ascend/aclnn/adaptor.hpp b/impl/ascend/aclnn/adaptor.hpp
index 603d3a330..4f8d8b2ca 100644
--- a/impl/ascend/aclnn/adaptor.hpp
+++ b/impl/ascend/aclnn/adaptor.hpp
@@ -76,6 +76,7 @@ inline aclTensor* createAclTensorFromDiopiTensor(diopiConstTensorHandle_t tensor
     if (tensor == nullptr) {
        return nullptr;
     }
+
     diopiSize_t shape{};
     diopiGetTensorShape(tensor, &shape);
     diopiSize_t stride{};
diff --git a/impl/ascend/ascend_tensor.cpp b/impl/ascend/ascend_tensor.cpp
index b18269325..bdefeb2a7 100644
--- a/impl/ascend/ascend_tensor.cpp
+++ b/impl/ascend/ascend_tensor.cpp
@@ -234,6 +234,8 @@ aclFormat inferAclDataFormat(int64_t dim, const int64_t* shape, const int64_t* s
             return ACL_FORMAT_NHWC;
         }
         std::call_once(warningFlag, warnOnUnsupportedFormat, __FILE__, __LINE__, __FUNCTION__);
+    } else if (dim == 3) {
+        return ACL_FORMAT_NCL;
     }
     return ACL_FORMAT_ND;
 }
diff --git a/impl/ascend/device_configs.py b/impl/ascend/device_configs.py
index 6ba6937d0..abb51d125 100755
--- a/impl/ascend/device_configs.py
+++ b/impl/ascend/device_configs.py
@@ -808,6 +808,11 @@
         rtol=5e-2,
         atol_half=5e-2,
         rtol_half=5e-2,
+        para=dict(
+            # for aclnnGroupNorm, eps must be greater than 0.
+            # aclnnGroupNorm does not support float16 input.
+            eps=[Skip(-1), Skip(0)],
+        ),
         tensor_para=dict(
             args=[
                 {
diff --git a/impl/ascend/functions/batch_norm.cpp b/impl/ascend/functions/batch_norm.cpp
index 29f392b68..d6a109e89 100644
--- a/impl/ascend/functions/batch_norm.cpp
+++ b/impl/ascend/functions/batch_norm.cpp
@@ -4,146 +4,48 @@
  * @copyright (c) 2023, DeepLink.
  */

-#include "../common/acloprunner.hpp"
+#include "../aclnn/acl_scalar.hpp"
+#include "../aclnn/adaptor.hpp"

 namespace impl {
 namespace ascend {

-void updateInputAscendTensorDim(AscendTensor& inputAt, bool training) {
-    int64_t dim = inputAt.dim();
-    if (2 == dim) {
-        inputAt.unsqueeze(2);
-        inputAt.unsqueeze(3);
-    } else if (3 == dim) {
-        inputAt.unsqueeze(3);
-    } else if (5 == dim && !training) {
-        std::vector<int64_t> shape4d{inputAt.shape(0), inputAt.shape(1), inputAt.shape(2), inputAt.shape(3) * inputAt.shape(4)};
-        inputAt.view(shape4d);
-    }
-}
-
-void batchNormBackwardTrainingUpdate(diopiContextHandle_t ctx, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias, AscendTensor gradOutputAt,
-                                     AscendTensor inputAt, diopiConstTensorHandle_t saveMean, diopiConstTensorHandle_t saveInvstd, double eps) {
-    std::string name = (inputAt.dim() == 5) ? "BN3DTrainingUpdateGrad" : "BNTrainingUpdateGrad";
-    AclOpRunner<4, 2>(name, ctx)
-        .addInput(gradOutputAt)
-        .addInput(inputAt)
-        .addInput(saveMean)
-        .addInput(saveInvstd)
-        .addOutput(gradWeight)
-        .addOutput(gradBias)
-        .setAttr("epsilon", static_cast<float>(eps))
-        .run();
-}
-
-void batchNormBackwardTrainingReduceNocheck(diopiContextHandle_t ctx, AscendTensor gradInputAt, diopiConstTensorHandle_t gradWeight,
-                                            diopiConstTensorHandle_t gradBias, AscendTensor gradOutputAt, AscendTensor inputAt, diopiConstTensorHandle_t weight,
-                                            diopiConstTensorHandle_t saveMean, diopiConstTensorHandle_t saveInvstd, double eps) {
-    std::string name = (inputAt.dim() == 5) ? "BN3DTrainingReduceGrad" : "BNTrainingReduceGrad";
-    AclOpRunner<7, 1>(name, ctx)
-        .addInput(gradOutputAt)
-        .addInput(inputAt)
-        .addInput(gradWeight)
-        .addInput(gradBias)
-        .addInput(weight)
-        .addInput(saveMean)
-        .addInput(saveInvstd)
-        .addOutput(gradInputAt)
-        .setAttr("epsilon", static_cast<float>(eps))
-        .run();
-}
-
 diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t saveMean, diopiTensorHandle_t saveInvstd,
                             diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiTensorHandle_t runningMean,
                             diopiTensorHandle_t runningVar, bool training, double momentum, double eps) {
-    AscendTensor inputAt(input), outputAt(out);
-    updateInputAscendTensorDim(inputAt, training);
-    outputAt.view(inputAt.getAclMemShape());
-
-    std::vector<int64_t> batchShapeV{inputAt.shape(1)};
-    diopiSize_t batchShapeSizeT{batchShapeV.data(), static_cast<int64_t>(batchShapeV.size())};
-    diopiTensorHandle_t weightTemp = createTensorIfNullptrOrConstCast(ctx, weight, batchShapeSizeT, inputAt.dtype(), true, 1);
-    diopiTensorHandle_t biasTemp = createTensorIfNullptrOrConstCast(ctx, bias, batchShapeSizeT, inputAt.dtype(), true, 0);
-    diopiTensorHandle_t runningMeanTemp = createTensorIfNullptrOrConstCast(ctx, runningMean, batchShapeSizeT, inputAt.dtype(), true, 0);
-    diopiTensorHandle_t runningVarTemp = createTensorIfNullptrOrConstCast(ctx, runningVar, batchShapeSizeT, inputAt.dtype(), true, 1);
-
-    if (!training) {
-        AclOpRunner<5, 1>("BNInfer", ctx)
-            .addInput(inputAt)
-            .addInput(weightTemp)
-            .addInput(biasTemp)
-            .addInput(runningMeanTemp)
-            .addInput(runningVarTemp)
-            .addOutput(outputAt)
-            .setAttr("epsilon", static_cast<float>(eps))
-            .run();
-
-        diopiTensorHandle_t runningVarBroadcasted;
-        makeTensorLike(ctx, &runningVarBroadcasted, input);
-        AscendTensor runningVarAt(runningVar);
-        runningVarAt.unsqueeze(0);
-        runningVarAt.unsqueeze(2);
-        runningVarAt.unsqueeze(3);
-        AclOpRunner<2, 1>("BroadcastTo", ctx).addInput(runningVarAt).addConstInput(inputAt.shape()).addOutput(runningVarBroadcasted).run();
-    } else {
-        diopiTensorHandle_t sum = nullptr, squareSum = nullptr;
-        diopiSize_t shape, stride;
-        diopiGetTensorShape(runningMeanTemp, &shape);
-        diopiGetTensorStride(runningMeanTemp, &stride);
-        diopiRequireTensor(ctx, &sum, &shape, &stride, diopiDtype_t::diopi_dtype_float32, diopi_device);
-        diopiRequireTensor(ctx, &squareSum, &shape, &stride, diopiDtype_t::diopi_dtype_float32, diopi_device);
-        AclOpRunner<1, 2>("BNTrainingReduce", ctx).addInput(inputAt).addOutput(sum).setAttr("epsilon", static_cast<float>(eps)).addOutput(squareSum).run();
-        AclOpRunner<7, 5>("BNTrainingUpdate", ctx)
-            .addInput(inputAt)
-            .addInput(sum)
-            .addInput(squareSum)
-            .addInput(weightTemp)
-            .addInput(biasTemp)
-            .addInput(runningMeanTemp)
-            .addInput(runningVarTemp)
-            .setAttr("epsilon", static_cast<float>(eps))
-            .setAttr("factor", static_cast<float>(momentum))
-            .addOutput(outputAt)
-            .addOutput(runningMeanTemp)
-            .addOutput(runningVarTemp)
-            .addOutput(saveMean)
-            .addOutput(saveInvstd)
-            .run();
-    }
+    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNorm, ctx, input, weight, bias, runningMean, runningVar, training, momentum, eps, out, saveMean, saveInvstd);
     return diopiSuccess;
 }

 diopiError_t diopiBatchNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
                                     diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
-                                    diopiConstTensorHandle_t runninMean, diopiConstTensorHandle_t runningVar, diopiConstTensorHandle_t saveMean,
+                                    diopiConstTensorHandle_t runningMean, diopiConstTensorHandle_t runningVar, diopiConstTensorHandle_t saveMean,
                                     diopiConstTensorHandle_t saveInvstd, bool training, double eps) {
-    AscendTensor inputAt(input), gradOutputAt(gradOutput), gradInputAt(gradInput);
-    updateInputAscendTensorDim(inputAt, training);
-    gradOutputAt.view(inputAt.getAclMemShape());
-    gradInputAt.view(inputAt.getAclMemShape());
-
-    if (!training) {
-        batchNormBackwardTrainingUpdate(ctx, gradWeight, gradBias, gradOutputAt, inputAt, runninMean, runningVar, eps);
-
-        AclOpRunner<3, 1>("BNInferGrad", ctx)
-            .addInput(gradOutputAt)
-            .addInput(weight)
-            .addInput(runningVar)
-            .addOutput(gradInputAt)
-            .setAttr("epsilon", static_cast<float>(eps))
-            .run();
-
-        diopiTensorHandle_t runningVarBroadcasted;
-        makeTensorLike(ctx, &runningVarBroadcasted, input);
-        AscendTensor runningVarAt(runningVar);
-        runningVarAt.unsqueeze(0);
-        runningVarAt.unsqueeze(2);
-        runningVarAt.unsqueeze(3);
-        AclOpRunner<2, 1>("BroadcastTo", ctx).addInput(runningVarAt).addConstInput(inputAt.shape()).addOutput(runningVarBroadcasted).run();
-    } else {
-        batchNormBackwardTrainingUpdate(ctx, gradWeight, gradBias, gradOutputAt, inputAt, saveMean, saveInvstd, eps);
-        batchNormBackwardTrainingReduceNocheck(ctx, gradInputAt, gradWeight, gradBias, gradOutputAt, inputAt, weight, saveMean, saveInvstd, eps);
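+    // aclnnBatchNormBackward takes an output mask: only gradients whose flag is
+    // true are computed, so switch off whichever outputs the caller passed as nullptr.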
+    std::array<bool, 3> gradMask = {true, true, true};
+    if (nullptr == gradInput) {
+        gradMask[0] = false;
+    }
+    if (nullptr == gradWeight) {
+        gradMask[1] = false;
+    }
+    if (nullptr == gradBias) {
+        gradMask[2] = false;
     }
+    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormBackward,
+                            ctx,
+                            gradOutput,
+                            input,
+                            weight,
+                            runningMean,
+                            runningVar,
+                            saveMean,
+                            saveInvstd,
+                            training,
+                            eps,
+                            gradMask,
+                            gradInput,
+                            gradWeight,
+                            gradBias);
     return diopiSuccess;
 }
diff --git a/impl/ascend/functions/dropout.cpp b/impl/ascend/functions/dropout.cpp
index 7769ac3a1..fc54e7f86 100644
--- a/impl/ascend/functions/dropout.cpp
+++ b/impl/ascend/functions/dropout.cpp
@@ -23,8 +23,9 @@ diopiError_t npuDropoutOut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
     diopiError_t ret = diopiRequireTensor(ctx, &maskNpu, &maskSize, nullptr, diopi_dtype_uint8, diopi_device);
     ASCEND_CHECK_ABORT(ret == diopiSuccess, "[npuDropoutOut] require tensor for mask failed.");

-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
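+    // getSeedAndOffset returns the generator's philox (seed, offset) pair; the third
+    // argument is presumably the counter increment reserved for this launch, so
+    // successive kernels draw from disjoint random streams.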
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutGenMask, ctx, inAt.shape(), p, seed, offset, maskNpu);
     DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutDoMask, ctx, input, maskNpu, p, out);
@@ -57,8 +58,9 @@ diopiError_t npuDropout2dOut(diopiContextHandle_t ctx, diopiTensorHandle_t out,
     diopiError_t ret = diopiRequireTensor(ctx, &maskNpu, &maskNpuSize, nullptr, diopi_dtype_uint8, diopi_device);
     ASCEND_CHECK_ABORT(ret == diopiSuccess, "[npuDropout2dOut] require tensor for mask failed.");

-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutGenMask, ctx, inAt.shape(), p, seed, offset, maskNpu);
     DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutDoMask, ctx, input2d, maskNpu, p, out2d);
diff --git a/impl/ascend/functions/group_norm.cpp b/impl/ascend/functions/group_norm.cpp
index 211bea042..3cdbad08b 100644
--- a/impl/ascend/functions/group_norm.cpp
+++ b/impl/ascend/functions/group_norm.cpp
@@ -4,7 +4,11 @@
  * @copyright (c) 2023, DeepLink.
  */

-#include "../common/acloprunner.hpp"
+#include <cmath>
+
+#include "../aclnn/acl_scalar.hpp"
+#include "../aclnn/adaptor.hpp"
+#include "../common/utils.hpp"

 namespace impl {
 namespace ascend {
@@ -12,23 +16,46 @@ namespace ascend {
 DIOPI_API diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t saveMean, diopiTensorHandle_t saveInvstd,
                                       diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t numGroups,
                                       double eps) {
-    if (0 == AscendTensor(input).numel()) {
-        AclOpRunner<1, 1>("Fills", ctx).addInput(out).setAttr("value", 0).addOutput(out).run();
+    AscendTensor inputAt(input);
+    if (!inputAt.defined() || inputAt.numel() == 0) {
         return diopiSuccess;
     }
-    AclOpRunner<3, 3>("GroupNorm", ctx)
-        .addInput(input)
-        .addInput(weight)
-        .addInput(bias)
-        .setAttr("num_groups", static_cast<int64_t>(numGroups))
-        .setAttr("epsilon", static_cast<float>(eps))
-        .setAttr("data_format", std::string{getAclDataFormat(input) == ACL_FORMAT_ND ? "ND" : "NCHW"})
-        .setAttr("is_training", true)
-        .addOutput(out)
-        .addOutput(saveMean)
-        .addOutput(saveInvstd)
-        .run();
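+    // aclnnGroupNorm describes the input as (N, C, HxW): fold every trailing
+    // spatial dimension into a single extent.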
+    int64_t n = inputAt.shape(0);
+    int64_t c = inputAt.shape(1);
+    int64_t hw = inputAt.numel() / (n * c);
+
+    DIOPI_ASCEND_CALL_ACLNN(aclnnGroupNorm, ctx, input, weight, bias, n, c, hw, numGroups, eps, out, saveMean, saveInvstd);
+    return diopiSuccess;
+}
+
+diopiError_t diopiGroupNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
+                                    diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
+                                    diopiConstTensorHandle_t mean, diopiConstTensorHandle_t rstd, int64_t numGroups) {
+    AscendTensor inputAt(input);
+    AscendTensor gradWeightAt(gradWeight);
+
+    if (!inputAt.defined()) {
+        return diopiSuccess;
+    }
+
+    if (inputAt.numel() == 0) {
+        DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceZero, ctx, gradBias);
+        if (inputAt.shape(0) == 0 || inputAt.shape(1) == 0) {
+            DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceZero, ctx, gradWeight);
+        } else {
+            diopiScalar_t nanScalar = constructDiopiScalarT(gradWeightAt.dtype(), std::nanf(""));
+            DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceFillScalar, ctx, gradWeightAt, &nanScalar);
+        }
+    } else {
+        int64_t n = inputAt.shape(0);
+        int64_t c = inputAt.shape(1);
+        int64_t hw = inputAt.numel() / (n * c);
+
+        std::array<bool, 3> gradMask = {gradInput != nullptr, gradWeight != nullptr, gradBias != nullptr};
+        DIOPI_ASCEND_CALL_ACLNN(
+            aclnnGroupNormBackward, ctx, gradOutput, inputAt, mean, rstd, weight, n, c, hw, numGroups, gradMask, gradInput, gradWeightAt, gradBias);
+    }
     return diopiSuccess;
 }
diff --git a/impl/ascend/functions/normal.cpp b/impl/ascend/functions/normal.cpp
index 236af5b0c..cb6c9f484 100644
--- a/impl/ascend/functions/normal.cpp
+++ b/impl/ascend/functions/normal.cpp
@@ -16,8 +16,9 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub
         return diopiSuccess;
     }

-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     float meanCast = static_cast<float>(mean);
     float rstdCast = static_cast<float>(std);
@@ -26,8 +27,9 @@
 }

 diopiError_t diopiNormalInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, double mean, double std, diopiGeneratorHandle_t generator) {
-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     float meanCast = static_cast<float>(mean);
     float rstdCast = static_cast<float>(std);
@@ -42,8 +44,9 @@ diopiError_t diopiNormalTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out
         return diopiSuccess;
     }

-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     DIOPI_ASCEND_CALL_ACLNN(aclnnNormalTensorTensor, ctx, mean, std, seed, offset, out);
     return diopiSuccess;
@@ -56,8 +59,9 @@ diopiError_t diopiNormalScalarTensor(diopiContextHandle_t ctx, diopiTensorHandle
         return diopiSuccess;
     }

-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     float meanCast = static_cast<float>(mean);
     DIOPI_ASCEND_CALL_ACLNN(aclnnNormalFloatTensor, ctx, meanCast, std, seed, offset, out);
@@ -71,8 +75,9 @@ diopiError_t diopiNormalTensorScalar(diopiContextHandle_t ctx, diopiTensorHandle
         return diopiSuccess;
     }

-    uint64_t seed, offset;
-    DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;

     float rstdCast = static_cast<float>(std);
     DIOPI_ASCEND_CALL_ACLNN(aclnnNormalTensorFloat, ctx, mean, rstdCast, seed, offset, out);
diff --git a/impl/ascend/functions/pool.cpp b/impl/ascend/functions/pool.cpp
index 708bd93a7..6b1f56ab6 100644
--- a/impl/ascend/functions/pool.cpp
+++ b/impl/ascend/functions/pool.cpp
@@ -4,44 +4,24 @@
  * @copyright (c) 2023, DeepLink.
  */

-#include "../common/acloprunner.hpp"
+#include "../aclnn/adaptor.hpp"

 namespace impl {
 namespace ascend {

 diopiError_t diopiAdaptiveAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t outputSize) {
-    if (0 == outputSize.data[0]) {
-        return diopiSuccess;
+    for (int64_t i = 0; i < outputSize.len; i++) {
+        if (outputSize.data[i] == 0) {
+            return diopiSuccess;
+        }
     }
-    AclOpRunner<1, 1>("AdaptiveAvgPool2d", ctx)
-        .addInput(input)
-        .setAttr("output_size", std::vector<int64_t>{static_cast<int64_t>(outputSize.data[0]), static_cast<int64_t>(outputSize.data[1])})
-        .addOutput(out)
-        .run();
+    DIOPI_ASCEND_CALL_ACLNN(aclnnAdaptiveAvgPool2d, ctx, input, outputSize, out);
     return diopiSuccess;
 }

 diopiError_t diopiAdaptiveAvgPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiConstTensorHandle_t gradOutput,
                                             diopiConstTensorHandle_t input) {
-    diopiSize_t shape;
-    diopiGetTensorShape(input, &shape);
-    diopiSize_t gradShape;
-    diopiGetTensorShape(gradOutput, &gradShape);
-    if (gradShape.data[gradShape.len - 1] == gradShape.data[gradShape.len - 2] && 1 == gradShape.data[gradShape.len - 1]) {
-        float temp = shape.data[shape.len - 1] * shape.data[shape.len - 2];
-        temp = temp == 0 ? 1 : temp;
-        temp = 1 / temp;
-        diopiScalar_t scalarTemp = constructDiopiScalarT(diopi_dtype_float64, temp);
-        diopiFill(ctx, gradInput, &scalarTemp);
-        diopiMulInp(ctx, gradInput, gradOutput);
-        return diopiSuccess;
-    }
-    std::vector<int64_t> shapeVector;
-    shapeVector.reserve(shape.len);
-    for (int i = 0; i < shape.len; ++i) {
-        shapeVector.push_back(static_cast<int64_t>(shape.data[i]));
-    }
-    AclOpRunner<1, 1>("AdaptiveAvgPool2dGrad", ctx).addInput(gradOutput).setAttr("orig_input_shape", shapeVector).addOutput(gradInput).run();
+    DIOPI_ASCEND_CALL_ACLNN(aclnnAdaptiveAvgPool2dBackward, ctx, gradOutput, input, gradInput);
     return diopiSuccess;
 }
diff --git a/impl/ascend/functions/uniform.cpp b/impl/ascend/functions/uniform.cpp
index ec898be89..fb10d69db 100644
--- a/impl/ascend/functions/uniform.cpp
+++ b/impl/ascend/functions/uniform.cpp
@@ -11,9 +11,9 @@ namespace impl {
 namespace ascend {

 diopiError_t diopiUniformInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, double from, double to, diopiGeneratorHandle_t generator) {
-    uint64_t seed = 0;
-    uint64_t offset = 0;
-    diopiGeneratorGetSeedAndOffset(generator, &seed, &offset);
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;
     DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceUniform, ctx, inout, from, to, seed, offset);
     return diopiSuccess;
 }
diff --git a/impl/ascend_npu/ascend_config.yaml b/impl/ascend_npu/ascend_config.yaml
index 7e04fc662..ceb71e244 100755
--- a/impl/ascend_npu/ascend_config.yaml
+++ b/impl/ascend_npu/ascend_config.yaml
@@ -1,6 +1,8 @@
 ascend:
 - diopiAbs
 - diopiAbsInp
+- diopiAdaptiveAvgPool2d
+- diopiAdaptiveAvgPool2dBackward
 - diopiAddcmul
 - diopiAddcmulInp
 - diopiAddmm
@@ -18,6 +20,8 @@ ascend:
 - diopiAtanInp
 - diopiBaddbmm
 - diopiBaddbmmInp
+- diopiBatchNorm
+- diopiBatchNormBackward
 - diopiBitwiseNot
 - diopiBitwiseNotInp
 - diopiBitwiseAnd
@@ -82,6 +86,8 @@ ascend:
 - diopiGelu
 - diopiGeluBackward
 - diopiGeScalar
+- diopiGroupNorm
+- diopiGroupNormBackward
 - diopiGt
 - diopiGtInp
 - diopiGtInpScalar
@@ -215,15 +221,9 @@ ascend:
 - diopiZeros
 ascend_npu:
 - diopiAdamW
-- diopiAdaptiveAvgPool2d
-- diopiAdaptiveAvgPool2dBackward
-- diopiBatchNorm
-- diopiBatchNormBackward
 - diopiNonzero
 - diopiCat
 - diopiCopyInp
-- diopiGroupNorm
-- diopiGroupNormBackward
 - diopiStack
 - diopiMultinomial
 - diopiRotaryEmbedding