From 28c6fdcf13e297c74f8538cd5e180b09d2095706 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E5=AE=87=E6=89=AC?= <huang.yuyang@think-force.com>
Date: Thu, 8 Aug 2024 19:06:53 +0800
Subject: [PATCH] fix: initialize idData in FastllmLayerNormKernelTop1/TopK

---
 src/devices/cuda/fastllm-cuda.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/devices/cuda/fastllm-cuda.cu b/src/devices/cuda/fastllm-cuda.cu
index 69c8e7e..a3af28e 100644
--- a/src/devices/cuda/fastllm-cuda.cu
+++ b/src/devices/cuda/fastllm-cuda.cu
@@ -1102,6 +1102,7 @@ __global__ void FastllmLayerNormKernelTop1(float *input, float *output, int chan
     float *inputData = input + blockIdx.x * channels;
     float *outputData = output + blockIdx.x * 2;
     int tid = threadIdx.x;
+    idData[tid] = tid;
     maxData[tid] = -1e100;
     for (int j = tid; j < channels; j += THREAD_PER_BLOCK) {
         if (inputData[j] > maxData[tid]) {
@@ -1134,6 +1135,7 @@ __global__ void FastllmLayerNormKernelTopK(float *input, float *output, int K, i
     float *inputData = input + blockIdx.x * channels;
     float *outputData = output + blockIdx.x * 2 * K;
     int tid = threadIdx.x;
+    idData[tid][0] = tid;
     for (int i = 0; i < K; i++) {
         maxData[tid][i] = -1e100;
     }