
Commit

Merge branch 'DeepLink-org:main' into zq/tiny-fix
NeosZhang authored Aug 30, 2024
2 parents 46b1811 + 6333a0d commit e2c2de9
Showing 18 changed files with 451 additions and 66 deletions.
6 changes: 3 additions & 3 deletions diopi_test/diopi_stub/csrc/litert.cpp
@@ -276,7 +276,7 @@ DIOPI_RT_API diopiError_t diopiIsTensorSparse(diopiConstTensorHandle_t th, bool*
     return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorCrowIndices(diopiConstTensorHandle_t th, diopiTensorHandle_t* crow_indices) {
+DIOPI_RT_API diopiError_t diopiGetTensorCrowIndices(diopiConstTensorHandle_t th, diopiConstTensorHandle_t* crow_indices) {
     diopiSparseCsrTensor* spTh = dynamic_cast<diopiSparseCsrTensor*>(const_cast<diopiTensor*>(th));
     if (!spTh) {
         return diopiErrorOccurred;
@@ -285,7 +285,7 @@ DIOPI_RT_API diopiError_t diopiGetTensorCrowIndices(diopiConstTensorHandle_t th,
     return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorColIndices(diopiConstTensorHandle_t th, diopiTensorHandle_t* col_indices) {
+DIOPI_RT_API diopiError_t diopiGetTensorColIndices(diopiConstTensorHandle_t th, diopiConstTensorHandle_t* col_indices) {
     diopiSparseCsrTensor* spTh = dynamic_cast<diopiSparseCsrTensor*>(const_cast<diopiTensor*>(th));
     if (!spTh) {
         return diopiErrorOccurred;
@@ -294,7 +294,7 @@ DIOPI_RT_API diopiError_t diopiGetTensorColIndices(diopiConstTensorHandle_t th,
     return diopiSuccess;
 }
 
-DIOPI_RT_API diopiError_t diopiGetTensorValues(diopiConstTensorHandle_t th, diopiTensorHandle_t* values) {
+DIOPI_RT_API diopiError_t diopiGetTensorValues(diopiConstTensorHandle_t th, diopiConstTensorHandle_t* values) {
     diopiSparseCsrTensor* spTh = dynamic_cast<diopiSparseCsrTensor*>(const_cast<diopiTensor*>(th));
     if (!spTh) {
         return diopiErrorOccurred;
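The three sparse accessors now return the CSR components through diopiConstTensorHandle_t out-parameters, making it explicit that callers get read-only views. A minimal, hypothetical calling sketch against the new signatures (the handle `th` and error handling are assumed, not part of this commit):

    diopiConstTensorHandle_t crow = nullptr;
    diopiConstTensorHandle_t col = nullptr;
    diopiConstTensorHandle_t values = nullptr;
    // Each getter fails with diopiErrorOccurred unless `th` is a diopiSparseCsrTensor.
    if (diopiGetTensorCrowIndices(th, &crow) == diopiSuccess &&
        diopiGetTensorColIndices(th, &col) == diopiSuccess &&
        diopiGetTensorValues(th, &values) == diopiSuccess) {
        // Inspect the CSR structure through the const handles; no mutation is possible.
    }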
9 changes: 8 additions & 1 deletion impl/ascend/aclnn/adaptor.hpp
@@ -210,7 +210,14 @@ decltype(auto) convertType(T&& param) {
 }
 
 template <class T>
-void releaseConverted(T&& param [[maybe_unused]]) {}  // no conversion, do nothing
+struct NeedReleaseType : std::disjunction<std::is_same<std::remove_cv_t<T>, aclTensor*>, std::is_same<std::remove_cv_t<T>, aclTensorList*>,
+                                          std::is_same<std::remove_cv_t<T>, aclScalar*>, std::is_same<std::remove_cv_t<T>, aclScalarList*>,
+                                          std::is_same<std::remove_cv_t<T>, aclIntArray*>, std::is_same<std::remove_cv_t<T>, aclBoolArray*>,
+                                          std::is_same<std::remove_cv_t<T>, aclFloatArray*>> {};
+
+// For the case that the input is not a NeedReleaseType, do nothing.
+template <class T, std::enable_if_t<!NeedReleaseType<T>::value, int> = 0>
+void releaseConverted(T param [[maybe_unused]]) {}  // no conversion, do nothing
 
 #define IMPL_ASCEND_ACLNN_REGISTER_DESTRUCTOR(Type) \
     inline void releaseConverted(const acl##Type* param) { \
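The new NeedReleaseType trait is what lets the no-op releaseConverted overload drop out for handle types that must be destroyed. A self-contained sketch of the same std::disjunction + std::enable_if_t dispatch, with stand-in types Foo and Bar instead of the real acl* handles (in the adaptor itself the releasing overloads are the non-template functions generated by IMPL_ASCEND_ACLNN_REGISTER_DESTRUCTOR; here a second constrained template stands in for them):

    #include <iostream>
    #include <type_traits>

    struct Foo {};  // stand-in for aclTensor, aclScalar, ...
    struct Bar {};  // stand-in for aclTensorList, ...

    template <class T>
    struct NeedRelease : std::disjunction<std::is_same<std::remove_cv_t<T>, Foo*>,
                                          std::is_same<std::remove_cv_t<T>, Bar*>> {};

    // Chosen when T is not on the release list: do nothing.
    template <class T, std::enable_if_t<!NeedRelease<T>::value, int> = 0>
    void release(T /*param*/) { std::cout << "no-op\n"; }

    // Chosen when T is on the release list: destroy the handle.
    template <class T, std::enable_if_t<NeedRelease<T>::value, int> = 0>
    void release(T /*param*/) { std::cout << "destroy\n"; }

    int main() {
        int i = 0;
        Foo f;
        release(&i);  // "no-op": int* is not on the list
        release(&f);  // "destroy": Foo* matches the trait
    }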
4 changes: 4 additions & 0 deletions impl/ascend/common/acloprunner.hpp
@@ -64,6 +64,10 @@ diopiError_t makeOnesLike(diopiContextHandle_t ctx, diopiTensorHandle_t* out, di
 
 diopiTensorHandle_t hostToDevice(diopiContextHandle_t ctx, diopiConstTensorHandle_t src);
 
+AscendTensor hostToDeviceAsync(diopiContextHandle_t ctx, const AscendTensor& hostTensor);
+
+AscendTensor deviceToHostSync(diopiContextHandle_t ctx, const AscendTensor& deviceTensor);
+
 inline std::vector<int64_t> calcStrides(int ndims, diopiSize_t size, diopiMemoryFormat_t format = diopiMemoryFormat_t::Contiguous) {
     std::vector<int64_t> strides;
     strides.resize(ndims);
43 changes: 43 additions & 0 deletions impl/ascend/common/utils.cpp
@@ -692,6 +692,49 @@ diopiTensorHandle_t hostToDevice(diopiContextHandle_t ctx, diopiConstTensorHandl
     }
 }
 
+AscendTensor hostToDeviceAsync(diopiContextHandle_t ctx, const AscendTensor& hostTensor) {
+    diopiDevice_t device = hostTensor.device();
+
+    if (device == diopi_host) {
+        diopiTensorHandle_t dst;
+        diopiSize_t size{hostTensor.shape().data(), hostTensor.dim()};
+        diopiSize_t stride{hostTensor.stride().data(), (int64_t)hostTensor.stride().size()};
+        diopiDtype_t dtype = hostTensor.dtype();
+        diopiRequireTensor(ctx, &dst, &size, &stride, dtype, diopi_device);
+        const void* srcPtr = hostTensor.data();
+        void* dstPtr;
+        diopiGetTensorData(dst, &dstPtr);
+        diopiStreamHandle_t stream;
+        diopiGetStream(ctx, &stream);
+        int64_t elemsize = hostTensor.numel() * hostTensor.elemsize();
+        CALL_ACLRT(aclrtMemcpyAsync(dstPtr, elemsize, const_cast<void*>(srcPtr), elemsize, ACL_MEMCPY_HOST_TO_DEVICE, stream));
+        return AscendTensor(dst);
+    } else {
+        return hostTensor;
+    }
+}
+
+AscendTensor deviceToHostSync(diopiContextHandle_t ctx, const AscendTensor& deviceTensor) {
+    if (deviceTensor.device() == diopi_device) {
+        diopiTensorHandle_t dst;
+        diopiSize_t size{deviceTensor.shape().data(), deviceTensor.dim()};
+        diopiSize_t stride{deviceTensor.stride().data(), (int64_t)deviceTensor.stride().size()};
+        diopiDtype_t dtype = deviceTensor.dtype();
+        diopiRequireTensor(ctx, &dst, &size, &stride, dtype, diopi_host);
+        const void* srcPtr = deviceTensor.data();
+        void* dstPtr;
+        diopiGetTensorData(dst, &dstPtr);
+        diopiStreamHandle_t stream;
+        diopiGetStream(ctx, &stream);
+        int64_t elemsize = deviceTensor.numel() * deviceTensor.elemsize();
+        CALL_ACLRT(aclrtMemcpyAsync(dstPtr, elemsize, const_cast<void*>(srcPtr), elemsize, ACL_MEMCPY_DEVICE_TO_HOST, stream));
+        CALL_ACLRT(aclrtSynchronizeStream(stream));
+        return AscendTensor(dst);
+    } else {
+        return deviceTensor;
+    }
+}
+
 static diopiError_t choiceDtype(const std::set<diopiDtype_t>& opSupportedDtypes, diopiDtype_t* dtype) {
     if (opSupportedDtypes.find(diopi_dtype_float32) != opSupportedDtypes.end()) {
         *dtype = diopi_dtype_float32;
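Note the deliberate asymmetry between the two new helpers: hostToDeviceAsync only enqueues the copy on the context stream, so the host buffer must stay alive until that stream is synchronized, while deviceToHostSync calls aclrtSynchronizeStream before returning and its result is immediately safe to read. A minimal usage sketch, assuming a DIOPI context ctx and a host-resident AscendTensor hostAt (hypothetical names):

    AscendTensor devAt = hostToDeviceAsync(ctx, hostAt);  // copy enqueued, not yet complete
    // ... enqueue device work on the same stream that consumes devAt ...
    AscendTensor backAt = deviceToHostSync(ctx, devAt);   // copies back, then blocks on the stream
    const int* p = reinterpret_cast<const int*>(backAt.data());  // safe: stream already synchronized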
9 changes: 9 additions & 0 deletions impl/ascend/convert_config.yaml
@@ -479,3 +479,12 @@
 - diopiMaxPool2dBackward:
     tensor_dtype:
       indices: (int64)->int32
+
+- diopiBatchNormStats:
+    dtype: (float64)->float32
+
+- diopiBatchNormGatherStatsWithCounts:
+    dtype: (float64)->float32
+
+- diopiBatchNormBackwardReduce:
+    dtype: (float64)->float32
2 changes: 1 addition & 1 deletion impl/ascend/functions/index.cpp
@@ -269,11 +269,11 @@ diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diop
     auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);
 
     std::vector<aclTensor*> allDefinedIndices;
+    auto emptyTensor = createEmptyAclTensor();
     for (const auto& idx : indicesExpanded) {
         if (idx.defined()) {
             allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));
         } else {
-            auto emptyTensor = createEmptyAclTensor();
             allDefinedIndices.push_back(emptyTensor);
         }
     }
34 changes: 34 additions & 0 deletions impl/ascend/functions/syn_batch_norm.cpp
@@ -0,0 +1,34 @@
/**
 * @file
 * @author DeepLink
 * @copyright (c) 2024, DeepLink.
 */

#include "../aclnn/adaptor.hpp"

namespace impl {
namespace ascend {

diopiError_t diopiBatchNormStats(diopiContextHandle_t ctx, diopiTensorHandle_t mean, diopiTensorHandle_t invstd, diopiConstTensorHandle_t input, double eps) {
    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormStats, ctx, input, eps, mean, invstd);
    return diopiSuccess;
}

diopiError_t diopiBatchNormBackwardReduce(diopiContextHandle_t ctx, diopiTensorHandle_t sumDy, diopiTensorHandle_t sumDyXmu, diopiTensorHandle_t gradWeight,
                                          diopiTensorHandle_t gradBias, diopiConstTensorHandle_t gradOut, diopiConstTensorHandle_t input,
                                          diopiConstTensorHandle_t mean, diopiConstTensorHandle_t invstd, diopiConstTensorHandle_t weight, bool inputG,
                                          bool weightG, bool biasG) {
    DIOPI_ASCEND_CALL_ACLNN(
        aclnnBatchNormReduceBackward, ctx, gradOut, input, mean, invstd, weight, inputG, weightG, biasG, sumDy, sumDyXmu, gradWeight, gradBias);
    return diopiSuccess;
}

diopiError_t diopiBatchNormGatherStatsWithCounts(diopiContextHandle_t ctx, diopiTensorHandle_t mean, diopiTensorHandle_t invstd, diopiConstTensorHandle_t input,
                                                 diopiConstTensorHandle_t meanAll, diopiConstTensorHandle_t invstdAll, diopiTensorHandle_t runningMean,
                                                 diopiTensorHandle_t runningVar, float momentum, float eps, diopiConstTensorHandle_t counts) {
    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormGatherStatsWithCounts, ctx, input, meanAll, invstdAll, runningMean, runningVar, momentum, eps, counts, mean, invstd);
    return diopiSuccess;
}

} // namespace ascend
} // namespace impl
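For reference, and assuming these kernels follow the usual PyTorch-style batch_norm_stats semantics, the per-channel statistics are

    \mu_c = \frac{1}{N} \sum_{i=1}^{N} x_{i,c}, \qquad
    \mathrm{invstd}_c = \frac{1}{\sqrt{\sigma_c^2 + \varepsilon}}, \qquad
    \sigma_c^2 = \frac{1}{N} \sum_{i=1}^{N} (x_{i,c} - \mu_c)^2,

where N counts the batch and spatial positions reduced for channel c and \varepsilon is the eps argument.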
107 changes: 107 additions & 0 deletions impl/ascend/functions_ext/token_attention_inference.cpp
@@ -0,0 +1,107 @@
/**
 * @file
 * @author DeepLink
 * @copyright (c) 2024, DeepLink.
 */

#include "../aclnn/adaptor.hpp"
#include "../common/acloprunner.hpp"
#include "impl_functions.hpp"

namespace impl {
namespace ascend {

diopiError_t diopiTokenAttentionInference(diopiContextHandle_t ctx, diopiTensorHandle_t attentionOut, diopiConstTensorHandle_t q, diopiConstTensorHandle_t k,
                                          diopiConstTensorHandle_t bLoc, diopiConstTensorHandle_t bStartLoc, diopiConstTensorHandle_t bSeqLen,
                                          int maxInputLen) {
    AscendTensor attentionOutAt(attentionOut), qAt(q), kAt(k), bLocAt(bLoc), bStartLocAt(bStartLoc), bSeqLenAt(bSeqLen);
    int batch = bLocAt.shape(0);
    int head = qAt.shape(1);
    int dim = qAt.shape(2);
    qAt = qAt.view({batch, head, 1, dim});
    diopiDtype_t dtype = qAt.dtype();
    diopiDevice_t device = qAt.device();

    AscendTensor bSeqLenHostAt = deviceToHostSync(ctx, bSeqLenAt);
    AscendTensor bStartLocHostAt = deviceToHostSync(ctx, bStartLocAt);

    const int* bSeqLenAtData = reinterpret_cast<const int*>(bSeqLenHostAt.data());
    const int* bStartLocAtData = reinterpret_cast<const int*>(bStartLocHostAt.data());

    for (int i = 0; i < batch; i++) {
        int curSeqLen = *(bSeqLenAtData + i);
        int curSeqStartLoc = *(bStartLocAtData + i);
        AscendTensor kLocAt, indexAt;
        makeTensor(ctx, indexAt, {curSeqLen}, diopi_dtype_int32);
        diopiScalar_t start = constructDiopiScalarT(diopi_dtype_int32, maxInputLen - curSeqLen);
        diopiScalar_t end = constructDiopiScalarT(diopi_dtype_int32, maxInputLen);
        diopiScalar_t step = constructDiopiScalarT(diopi_dtype_int32, 1);
        DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &start, &end, &step, indexAt);

        AscendTensor bLocAtSlice;
        makeTensor(ctx, bLocAtSlice, {1, bLocAt.shape(1)}, bLocAt.dtype());

        diopiScalar_t sliceIndexScalar = constructDiopiScalarT(diopi_dtype_int32, i);
        AscendTensor sliceIndexAt;
        makeTensorFromScalar(ctx, sliceIndexAt, &sliceIndexScalar, bLocAt.device());
        DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAt, 0, sliceIndexAt, bLocAtSlice);
        bLocAtSlice.view({bLocAt.shape(1)});
        makeTensor(ctx, kLocAt, {curSeqLen}, bLocAt.dtype());
        DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAtSlice, 0, indexAt, kLocAt);

        diopiTensorHandle_t keyTmp;
        diopiConstTensorHandle_t indexAtHandle = kLocAt.tensorHandle();
        ascend_npu::diopiIndex(ctx, &keyTmp, k, &indexAtHandle, 1);

        AscendTensor keyTmpAt(keyTmp);

        keyTmpAt = keyTmpAt.unsqueeze(0);
        AscendTensor keyAt;
        makeTensor(ctx, keyAt, {1, head, curSeqLen, dim}, keyTmpAt.dtype());
        std::vector<int64_t> dims{0, 2, 1, 3};
        diopiSize_t permuteDims = vectorToDiopiSize(dims);
        DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, keyTmpAt, permuteDims, keyAt);

        AscendTensor outLocAt;
        makeTensor(ctx, outLocAt, {curSeqLen}, diopi_dtype_int32);
        diopiScalar_t startScalar = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc);
        diopiScalar_t endScalar = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc + curSeqLen);
        diopiScalar_t stepScalar = constructDiopiScalarT(diopi_dtype_int32, 1);
        DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &startScalar, &endScalar, &stepScalar, outLocAt);

        AscendTensor scalarTensor;
        diopiScalar_t scalarI = constructDiopiScalarT(diopi_dtype_int64, i);
        makeTensorFromScalar(ctx, scalarTensor, &scalarI, qAt.device());

        diopiTensorHandle_t qIndex;
        diopiConstTensorHandle_t scalarTensorHandle = scalarTensor.tensorHandle();
        ascend_npu::diopiIndex(ctx, &qIndex, qAt.tensorHandle(), &scalarTensorHandle, 1);

        AscendTensor qIndexAt(qIndex);

        AscendTensor matmulOutAt;
        makeTensor(ctx, matmulOutAt, {keyAt.shape(0), keyAt.shape(1), qIndexAt.shape(0), keyAt.shape(2)}, keyAt.dtype());
        qIndexAt.unsqueeze(0);

        AscendTensor keyTmp2At;
        makeTensor(ctx, keyTmp2At, {keyAt.shape(0), keyAt.shape(1), keyAt.shape(3), keyAt.shape(2)}, keyAt.dtype());
        dims = {0, 1, 3, 2};
        permuteDims = vectorToDiopiSize(dims);
        DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, keyAt, permuteDims, keyTmp2At);

        DIOPI_ASCEND_CALL_ACLNN(
            aclnnMatmul, ctx, qIndexAt.view({qIndexAt.shape(0), qIndexAt.shape(2), qIndexAt.shape(1), qIndexAt.shape(3)}), keyTmp2At, matmulOutAt, 0);

        AscendTensor sqrtDimAt;
        diopiScalar_t sqrtDim = constructDiopiScalarT(qAt.dtype(), sqrt(dim));
        makeTensorFromScalar(ctx, sqrtDimAt, &sqrtDim, matmulOutAt.device());
        DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceDiv, ctx, matmulOutAt, sqrtDimAt);

        std::vector<AscendTensor> indices{AscendTensor(), outLocAt};
        DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, attentionOutAt, indices, matmulOutAt.view({head, curSeqLen}), false, true);
    }
    return diopiSuccess;
}

} // namespace ascend
} // namespace impl
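Net effect of each loop iteration: the current batch element's query is scored against its gathered key rows and the scaled result is scattered into attentionOut at the outLoc positions. Per head h and token j of batch element i,

    \mathrm{attentionOut}[h, \mathrm{outLoc}_j] = \frac{q_{i,h} \cdot k_{h,j}}{\sqrt{d}}, \qquad j = 1, \dots, \mathrm{curSeqLen},

with d = dim, matching the aclnnMatmul followed by the in-place division by sqrt(dim).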
103 changes: 103 additions & 0 deletions impl/ascend/functions_ext/token_softmax_reducev_inference.cpp
@@ -0,0 +1,103 @@
/**
 * @file
 * @author DeepLink
 * @copyright (c) 2024, DeepLink.
 */

#include <vector>

#include "../aclnn/adaptor.hpp"
#include "../common/acloprunner.hpp"
#include "impl_functions.hpp"

namespace impl {
namespace ascend {

diopiError_t diopiTokenSoftmaxReduceVInference(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t logics, diopiConstTensorHandle_t v,
                                               diopiConstTensorHandle_t bLoc, diopiConstTensorHandle_t bStartLoc, diopiConstTensorHandle_t bSeqLen,
                                               int maxInputLen, int otherKVIndex) {
    AscendTensor outAt(out), logicsAt(logics), vAt(v), bLocAt(bLoc), bStartLocAt(bStartLoc), bSeqLenAt(bSeqLen);
    int batch = bLocAt.shape(0);
    int head = vAt.shape(1);
    int dim = vAt.shape(2);
    diopiDtype_t dtype = logicsAt.dtype();
    diopiDevice_t device = logicsAt.device();

    AscendTensor bSeqLenHostAt = deviceToHostSync(ctx, bSeqLenAt);
    AscendTensor bStartLocHostAt = deviceToHostSync(ctx, bStartLocAt);

    const int* bSeqLenAtData = reinterpret_cast<const int*>(bSeqLenHostAt.data());
    const int* bStartLocAtData = reinterpret_cast<const int*>(bStartLocHostAt.data());

    for (int i = 0; i < batch; i++) {
        int curSeqLen = *(bSeqLenAtData + i);
        int curSeqStartLoc = *(bStartLocAtData + i);
        AscendTensor indexAt;
        makeTensor(ctx, indexAt, {curSeqLen}, diopi_dtype_int32);
        diopiScalar_t start = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc);
        diopiScalar_t end = constructDiopiScalarT(diopi_dtype_int32, curSeqStartLoc + curSeqLen);
        diopiScalar_t step = constructDiopiScalarT(diopi_dtype_int32, 1);
        DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &start, &end, &step, indexAt);

        diopiTensorHandle_t indexOut;
        diopiConstTensorHandle_t indices[2] = {diopiConstTensorHandle_t(), indexAt.tensorHandle()};
        ascend_npu::diopiIndex(ctx, &indexOut, logicsAt.tensorHandle(), indices, 2);
        AscendTensor indexOutAt(indexOut);

        AscendTensor softmaxOutAt;
        makeTensor(ctx, softmaxOutAt, indexOutAt.shape(), indexOutAt.dtype());
        DIOPI_ASCEND_CALL_ACLNN(aclnnSoftmax, ctx, indexOutAt, indexOutAt.dim() - 1, softmaxOutAt);

        softmaxOutAt = softmaxOutAt.view({head, 1, 1, curSeqLen});
        AscendTensor pAt;
        makeTensor(ctx, pAt, {softmaxOutAt.shape(1), softmaxOutAt.shape(0), softmaxOutAt.shape(2), softmaxOutAt.shape(3)}, logicsAt.dtype());
        std::vector<int64_t> dims{1, 0, 2, 3};
        diopiSize_t permuteDims = vectorToDiopiSize(dims);
        DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, softmaxOutAt, permuteDims, pAt);

        makeTensor(ctx, indexAt, {curSeqLen}, diopi_dtype_int32);
        diopiScalar_t startVLoc = constructDiopiScalarT(diopi_dtype_int32, maxInputLen - curSeqLen);
        diopiScalar_t endVLoc = constructDiopiScalarT(diopi_dtype_int32, maxInputLen);
        diopiScalar_t stepvLoc = constructDiopiScalarT(diopi_dtype_int32, 1);
        DIOPI_ASCEND_CALL_ACLNN(aclnnArange, ctx, &startVLoc, &endVLoc, &stepvLoc, indexAt);

        AscendTensor bLocAtSlice;
        makeTensor(ctx, bLocAtSlice, {1, bLocAt.shape(1)}, bLocAt.dtype());
        diopiScalar_t sliceIndexScalar = constructDiopiScalarT(diopi_dtype_int32, i);
        AscendTensor sliceIndexAt;
        makeTensorFromScalar(ctx, sliceIndexAt, &sliceIndexScalar, bLocAt.device());
        DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAt, 0, sliceIndexAt, bLocAtSlice);
        bLocAtSlice.view({bLocAt.shape(1)});

        AscendTensor vLocAt;
        makeTensor(ctx, vLocAt, {curSeqLen}, bLocAt.dtype());
        DIOPI_ASCEND_CALL_ACLNN(aclnnIndexSelect, ctx, bLocAtSlice, 0, indexAt, vLocAt);

        diopiTensorHandle_t vIndexOut;
        diopiConstTensorHandle_t indexAtHandle = vLocAt.tensorHandle();
        ascend_npu::diopiIndex(ctx, &vIndexOut, vAt.tensorHandle(), &indexAtHandle, 1);

        AscendTensor vIndexOutAt(vIndexOut);
        vIndexOutAt = vIndexOutAt.view({1, curSeqLen, head, dim});

        AscendTensor vAt;
        makeTensor(ctx, vAt, {1, head, curSeqLen, dim}, vIndexOutAt.dtype());
        dims = {0, 2, 1, 3};
        permuteDims = vectorToDiopiSize(dims);
        DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, vIndexOutAt, permuteDims, vAt);

        AscendTensor matmulOutAt;
        makeTensor(ctx, matmulOutAt, {pAt.shape(0), pAt.shape(1), pAt.shape(2), vAt.shape(3)}, pAt.dtype());
        DIOPI_ASCEND_CALL_ACLNN(aclnnMatmul, ctx, pAt, vAt, matmulOutAt, 0);

        diopiScalar_t scalarI = constructDiopiScalarT(diopi_dtype_int32, i);
        AscendTensor tensorI;
        makeTensorFromScalar(ctx, tensorI, &scalarI, matmulOutAt.device());
        std::vector<AscendTensor> indexPutIndices{tensorI};
        DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, outAt, indexPutIndices, matmulOutAt.view({head, dim}), false, true);
    }
    return diopiSuccess;
}

} // namespace ascend
} // namespace impl
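Per batch element i this is the second half of token attention: a softmax over that element's slice of the logits, then a weighted reduction of the gathered V rows. Per head h,

    \mathrm{out}_i[h] = \sum_{j=1}^{L_i} \mathrm{softmax}\big(\mathrm{logics}[h, s_i : s_i + L_i]\big)_j \, v_{h,j},

where s_i = bStartLoc[i] and L_i = bSeqLen[i], matching the aclnnSoftmax, aclnnMatmul, and final aclnnIndexPutImpl calls.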
3 changes: 3 additions & 0 deletions impl/ascend_npu/CMakeLists.txt
mode changed 100755 → 100644
@@ -194,6 +194,7 @@ set(OLD_IMPL_SRC
     ${OLD_IMPL_DIR}/functions/equal.cpp
     ${OLD_IMPL_DIR}/functions/masked_select.cpp
     ${OLD_IMPL_DIR}/functions/unique.cpp
+    ${OLD_IMPL_DIR}/functions/syn_batch_norm.cpp
     ${OLD_IMPL_DIR}/functions_mmcv/roi_align_npu.cpp
     ${OLD_IMPL_DIR}/functions_ext/rms_norm.cpp
     ${OLD_IMPL_DIR}/functions_ext/rotary_embedding.cpp
@@ -205,6 +206,8 @@ set(OLD_IMPL_SRC
     ${OLD_IMPL_DIR}/functions_ext/prompt_flash_attention.cpp
     ${OLD_IMPL_DIR}/functions_ext/paged_attention.cpp
     ${OLD_IMPL_DIR}/functions_ext/matmul_all_reduce.cpp
+    ${OLD_IMPL_DIR}/functions_ext/token_attention_inference.cpp
+    ${OLD_IMPL_DIR}/functions_ext/token_softmax_reducev_inference.cpp
     #${OLD_IMPL_DIR}/test/export_functions.cpp
     #${OLD_IMPL_DIR}/test/conform_test.cpp
     ${OLD_IMPL_DIR}/common/utils.cpp
(The remaining 8 changed files are not shown.)
