-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'DeepLink-org:main' into zq/tiny-fix
- Loading branch information
Showing
18 changed files
with
451 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/** | ||
* @file | ||
* @author DeepLink | ||
* @copyright (c) 2024, DeepLink. | ||
*/ | ||
|
||
#include "../aclnn/adaptor.hpp" | ||
|
||
namespace impl {
namespace ascend {

/**
 * @brief Compute batch-norm statistics (mean and inverse standard deviation)
 *        of `input`, writing results into `mean` and `invstd`.
 *
 * Thin adaptor forwarding to the Ascend aclnnBatchNormStats operator with
 * inputs (input, eps) followed by outputs (mean, invstd).
 * NOTE(review): the exact statistic semantics (e.g. biased vs. unbiased
 * variance, reduction axes) are defined by aclnnBatchNormStats — confirm
 * against the CANN operator documentation.
 */
diopiError_t diopiBatchNormStats(diopiContextHandle_t ctx, diopiTensorHandle_t mean, diopiTensorHandle_t invstd, diopiConstTensorHandle_t input, double eps) {
    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormStats, ctx, input, eps, mean, invstd);
    return diopiSuccess;
}

/**
 * @brief Reduction step of the batch-norm backward pass.
 *
 * Forwards to aclnnBatchNormReduceBackward. The boolean flags inputG /
 * weightG / biasG are passed straight through to the operator; the reduced
 * sums (sumDy, sumDyXmu) and parameter gradients (gradWeight, gradBias) are
 * the trailing output arguments of the aclnn call.
 */
diopiError_t diopiBatchNormBackwardReduce(diopiContextHandle_t ctx, diopiTensorHandle_t sumDy, diopiTensorHandle_t sumDyXmu, diopiTensorHandle_t gradWeight,
                                          diopiTensorHandle_t gradBias, diopiConstTensorHandle_t gradOut, diopiConstTensorHandle_t input,
                                          diopiConstTensorHandle_t mean, diopiConstTensorHandle_t invstd, diopiConstTensorHandle_t weight, bool inputG,
                                          bool weightG, bool biasG) {
    DIOPI_ASCEND_CALL_ACLNN(
        aclnnBatchNormReduceBackward, ctx, gradOut, input, mean, invstd, weight, inputG, weightG, biasG, sumDy, sumDyXmu, gradWeight, gradBias);
    return diopiSuccess;
}

/**
 * @brief Combine per-device statistics (meanAll, invstdAll) weighted by
 *        `counts` into global mean/invstd, also updating the running stats.
 *
 * Forwards to aclnnBatchNormGatherStatsWithCounts; `runningMean` and
 * `runningVar` are updated in place by the operator using `momentum`
 * (presumed — behavior is defined by the aclnn operator; verify against the
 * CANN docs).
 */
diopiError_t diopiBatchNormGatherStatsWithCounts(diopiContextHandle_t ctx, diopiTensorHandle_t mean, diopiTensorHandle_t invstd, diopiConstTensorHandle_t input,
                                                 diopiConstTensorHandle_t meanAll, diopiConstTensorHandle_t invstdAll, diopiTensorHandle_t runningMean,
                                                 diopiTensorHandle_t runningVar, float momentum, float eps, diopiConstTensorHandle_t counts) {
    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormGatherStatsWithCounts, ctx, input, meanAll, invstdAll, runningMean, runningVar, momentum, eps, counts, mean, invstd);
    return diopiSuccess;
}

} // namespace ascend
} // namespace impl
107 changes: 107 additions & 0 deletions
107
impl/ascend/functions_ext/token_attention_inference.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
/** | ||
* @file | ||
* @author DeepLink | ||
* @copyright (c) 2024, DeepLink. | ||
*/ | ||
|
||
#include "../aclnn/adaptor.hpp" | ||
#include "../common/acloprunner.hpp" | ||
#include "impl_functions.hpp" | ||
|
||
namespace impl { | ||
namespace ascend { | ||
|
||
diopiError_t diopiTokenAttentionInference(diopiContextHandle_t ctx, diopiTensorHandle_t attentionOut, diopiConstTensorHandle_t q, diopiConstTensorHandle_t k, | ||
diopiConstTensorHandle_t bLoc, diopiConstTensorHandle_t bStartLoc, diopiConstTensorHandle_t bSeqLen, | ||
int maxInputLen) { | ||
AscendTensor attentionOutAt(attentionOut), qAt(q), kAt(k), bLocAt(bLoc), bStartLocAt(bStartLoc), bSeqLenAt(bSeqLen); | ||
int batch = bLocAt.shape(0); | ||
int head = qAt.shape(1); | ||
int dim = qAt.shape(2); | ||
qAt = qAt.view({batch, head, 1, dim}); | ||
diopiDtype_t dtype = qAt.dtype(); | ||
diopiDevice_t device = qAt.device(); | ||
|
||
AscendTensor bSeqLenHostAt = deviceToHostSync(ctx, bSeqLenAt); | ||
AscendTensor bStartLocHostAt = deviceToHostSync(ctx, bStartLocAt); | ||
|
||
const int* bSeqLenAtData = reinterpret_cast<const int*>(bSeqLenHostAt.data()); | ||
const int* bStartLocAtData = reinterpret_cast<const int*>(bStartLocHostAt.data()); | ||
|
||
for (int i = 0; i < batch; i++) { | ||
int curSeqLen = *(bSeqLenAtData + i); | ||
int curSeqStartLoc = *(bStartLocAtData + i); | ||
AscendTensor kLocAt, indexAt; | ||
makeTensor(ctx, indexAt, {curSeqLen}, diopi_dtype_int32); | ||
diopiScalar_t start = constructDiopiScalarT(diopi_dtype_int32, maxInputLen - curSeqLen); | ||
diopiScalar_t end = constructDiopiScalarT(diopi_dtype_int32, maxInputLen); | ||
diopiScalar_t step = constructDiopiScalarT(diopi_dtype_int32, 1); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &start, &end, &step, indexAt); | ||
|
||
AscendTensor bLocAtSlice; | ||
makeTensor(ctx, bLocAtSlice, {1, bLocAt.shape(1)}, bLocAt.dtype()); | ||
|
||
diopiScalar_t sliceIndexScalar = constructDiopiScalarT(diopi_dtype_int32, i); | ||
AscendTensor sliceIndexAt; | ||
makeTensorFromScalar(ctx, sliceIndexAt, &sliceIndexScalar, bLocAt.device()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAt, 0, sliceIndexAt, bLocAtSlice); | ||
bLocAtSlice.view({bLocAt.shape(1)}); | ||
makeTensor(ctx, kLocAt, {curSeqLen}, bLocAt.dtype()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAtSlice, 0, indexAt, kLocAt); | ||
|
||
diopiTensorHandle_t keyTmp; | ||
diopiConstTensorHandle_t indexAtHandle = kLocAt.tensorHandle(); | ||
ascend_npu::diopiIndex(ctx, &keyTmp, k, &indexAtHandle, 1); | ||
|
||
AscendTensor keyTmpAt(keyTmp); | ||
|
||
keyTmpAt = keyTmpAt.unsqueeze(0); | ||
AscendTensor keyAt; | ||
makeTensor(ctx, keyAt, {1, head, curSeqLen, dim}, keyTmpAt.dtype()); | ||
std::vector<int64_t> dims{0, 2, 1, 3}; | ||
diopiSize_t permuteDims = vectorToDiopiSize(dims); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, keyTmpAt, permuteDims, keyAt); | ||
|
||
AscendTensor outLocAt; | ||
makeTensor(ctx, outLocAt, {curSeqLen}, diopi_dtype_int32); | ||
diopiScalar_t startScalar = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc); | ||
diopiScalar_t endScalar = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc + curSeqLen); | ||
diopiScalar_t stepScalar = constructDiopiScalarT(diopi_dtype_int32, 1); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &startScalar, &endScalar, &stepScalar, outLocAt); | ||
|
||
AscendTensor scalarTensor; | ||
diopiScalar_t scalarI = constructDiopiScalarT(diopi_dtype_int64, i); | ||
makeTensorFromScalar(ctx, scalarTensor, &scalarI, qAt.device()); | ||
|
||
diopiTensorHandle_t qIndex; | ||
diopiConstTensorHandle_t scalarTensorHandle = scalarTensor.tensorHandle(); | ||
ascend_npu::diopiIndex(ctx, &qIndex, qAt.tensorHandle(), &scalarTensorHandle, 1); | ||
|
||
AscendTensor qIndexAt(qIndex); | ||
|
||
AscendTensor matmulOutAt; | ||
makeTensor(ctx, matmulOutAt, {keyAt.shape(0), keyAt.shape(1), qIndexAt.shape(0), keyAt.shape(2)}, keyAt.dtype()); | ||
qIndexAt.unsqueeze(0); | ||
|
||
AscendTensor keyTmp2At; | ||
makeTensor(ctx, keyTmp2At, {keyAt.shape(0), keyAt.shape(1), keyAt.shape(3), keyAt.shape(2)}, keyAt.dtype()); | ||
dims = {0, 1, 3, 2}; | ||
permuteDims = vectorToDiopiSize(dims); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, keyAt, permuteDims, keyTmp2At); | ||
|
||
DIOPI_ASCEND_CALL_ACLNN( | ||
aclnnMatmul, ctx, qIndexAt.view({qIndexAt.shape(0), qIndexAt.shape(2), qIndexAt.shape(1), qIndexAt.shape(3)}), keyTmp2At, matmulOutAt, 0); | ||
|
||
AscendTensor sqrtDimAt; | ||
diopiScalar_t sqrtDim = constructDiopiScalarT(qAt.dtype(), sqrt(dim)); | ||
makeTensorFromScalar(ctx, sqrtDimAt, &sqrtDim, matmulOutAt.device()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceDiv, ctx, matmulOutAt, sqrtDimAt); | ||
|
||
std::vector<AscendTensor> indices{AscendTensor(), outLocAt}; | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, attentionOutAt, indices, matmulOutAt.view({head, curSeqLen}), false, true); | ||
} | ||
return diopiSuccess; | ||
} | ||
|
||
} // namespace ascend | ||
} // namespace impl |
103 changes: 103 additions & 0 deletions
103
impl/ascend/functions_ext/token_softmax_reducev_inference.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/** | ||
* @file | ||
* @author DeepLink | ||
* @copyright (c) 2024, DeepLink. | ||
*/ | ||
|
||
#include <vector> | ||
|
||
#include "../aclnn/adaptor.hpp" | ||
#include "../common/acloprunner.hpp" | ||
#include "impl_functions.hpp" | ||
|
||
namespace impl { | ||
namespace ascend { | ||
|
||
diopiError_t diopiTokenSoftmaxReduceVInference(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t logics, diopiConstTensorHandle_t v, | ||
diopiConstTensorHandle_t bLoc, diopiConstTensorHandle_t bStartLoc, diopiConstTensorHandle_t bSeqLen, | ||
int maxInputLen, int otherKVIndex) { | ||
AscendTensor outAt(out), logicsAt(logics), vAt(v), bLocAt(bLoc), bStartLocAt(bStartLoc), bSeqLenAt(bSeqLen); | ||
int batch = bLocAt.shape(0); | ||
int head = vAt.shape(1); | ||
int dim = vAt.shape(2); | ||
diopiDtype_t dtype = logicsAt.dtype(); | ||
diopiDevice_t device = logicsAt.device(); | ||
|
||
AscendTensor bSeqLenHostAt = deviceToHostSync(ctx, bSeqLenAt); | ||
AscendTensor bStartLocHostAt = deviceToHostSync(ctx, bStartLocAt); | ||
|
||
const int* bSeqLenAtData = reinterpret_cast<const int*>(bSeqLenHostAt.data()); | ||
const int* bStartLocAtData = reinterpret_cast<const int*>(bStartLocHostAt.data()); | ||
|
||
for (int i = 0; i < batch; i++) { | ||
int curSeqLen = *(bSeqLenAtData + i); | ||
int curSeqStartLoc = *(bStartLocAtData + i); | ||
AscendTensor indexAt; | ||
makeTensor(ctx, indexAt, {curSeqLen}, diopi_dtype_int32); | ||
diopiScalar_t start = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc); | ||
diopiScalar_t end = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc + curSeqLen); | ||
diopiScalar_t step = constructDiopiScalarT(diopi_dtype_int32, 1); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &start, &end, &step, indexAt); | ||
|
||
diopiTensorHandle_t indexOut; | ||
diopiConstTensorHandle_t indices[2] = {diopiConstTensorHandle_t(), indexAt.tensorHandle()}; | ||
ascend_npu::diopiIndex(ctx, &indexOut, logicsAt.tensorHandle(), indices, 2); | ||
AscendTensor indexOutAt(indexOut); | ||
|
||
AscendTensor softmaxOutAt; | ||
makeTensor(ctx, softmaxOutAt, indexOutAt.shape(), indexOutAt.dtype()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnSoftmax, ctx, indexOutAt, indexOutAt.dim() - 1, softmaxOutAt); | ||
|
||
softmaxOutAt = softmaxOutAt.view({head, 1, 1, curSeqLen}); | ||
AscendTensor pAt; | ||
makeTensor(ctx, pAt, {softmaxOutAt.shape(1), softmaxOutAt.shape(0), softmaxOutAt.shape(2), softmaxOutAt.shape(3)}, logicsAt.dtype()); | ||
std::vector<int64_t> dims{1, 0, 2, 3}; | ||
diopiSize_t permuteDims = vectorToDiopiSize(dims); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, softmaxOutAt, permuteDims, pAt); | ||
|
||
makeTensor(ctx, indexAt, {curSeqLen}, diopi_dtype_int32); | ||
diopiScalar_t startVLoc = constructDiopiScalarT(diopi_dtype_int32, maxInputLen - curSeqLen); | ||
diopiScalar_t endVLoc = constructDiopiScalarT(diopi_dtype_int32, maxInputLen); | ||
diopiScalar_t stepvLoc = constructDiopiScalarT(diopi_dtype_int32, 1); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &startVLoc, &endVLoc, &stepvLoc, indexAt); | ||
|
||
AscendTensor bLocAtSlice; | ||
makeTensor(ctx, bLocAtSlice, {1, bLocAt.shape(1)}, bLocAt.dtype()); | ||
diopiScalar_t sliceIndexScalar = constructDiopiScalarT(diopi_dtype_int32, i); | ||
AscendTensor sliceIndexAt; | ||
makeTensorFromScalar(ctx, sliceIndexAt, &sliceIndexScalar, bLocAt.device()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAt, 0, sliceIndexAt, bLocAtSlice); | ||
bLocAtSlice.view({bLocAt.shape(1)}); | ||
|
||
AscendTensor vLocAt; | ||
makeTensor(ctx, vLocAt, {curSeqLen}, bLocAt.dtype()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAtSlice, 0, indexAt, vLocAt); | ||
|
||
diopiTensorHandle_t vIndexOut; | ||
diopiConstTensorHandle_t indexAtHandle = vLocAt.tensorHandle(); | ||
ascend_npu::diopiIndex(ctx, &vIndexOut, vAt.tensorHandle(), &indexAtHandle, 1); | ||
|
||
AscendTensor vIndexOutAt(vIndexOut); | ||
vIndexOutAt = vIndexOutAt.view({1, curSeqLen, head, dim}); | ||
|
||
AscendTensor vAt; | ||
makeTensor(ctx, vAt, {1, head, curSeqLen, dim}, vIndexOutAt.dtype()); | ||
dims = {0, 2, 1, 3}; | ||
permuteDims = vectorToDiopiSize(dims); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, vIndexOutAt, permuteDims, vAt); | ||
|
||
AscendTensor matmulOutAt; | ||
makeTensor(ctx, matmulOutAt, {pAt.shape(0), pAt.shape(1), pAt.shape(2), vAt.shape(3)}, pAt.dtype()); | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnMatmul, ctx, pAt, vAt, matmulOutAt, 0); | ||
|
||
diopiScalar_t scalarI = constructDiopiScalarT(diopi_dtype_int32, i); | ||
AscendTensor tensorI; | ||
makeTensorFromScalar(ctx, tensorI, &scalarI, matmulOutAt.device()); | ||
std::vector<AscendTensor> indexPutIndices{tensorI}; | ||
DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, outAt, indexPutIndices, matmulOutAt.view({head, dim}), false, true); | ||
} | ||
return diopiSuccess; | ||
} | ||
|
||
} // namespace ascend | ||
} // namespace impl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.