Add a function to determine whether an attentionMask needs to be generated
黄宇扬 committed Jun 15, 2024
1 parent 9afe60c commit 1399b63
Showing 6 changed files with 40 additions and 8 deletions.
3 changes: 3 additions & 0 deletions include/models/basellm.h
@@ -125,6 +125,9 @@ namespace fastllm {
                              const LastTokensManager &lastTokens = LastTokensManager(),
                              std::vector <std::vector <float>*> *logits = nullptr);
 
+        // Whether an AttentionMask needs to be generated
+        virtual bool NeedAttentionMask(int qlen, int klen);
+
         // Generate LLM inference inputs from the input tokens
         virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
                                    const std::map <std::string, int> &params,
3 changes: 3 additions & 0 deletions include/models/llama.h
@@ -56,6 +56,9 @@ namespace fastllm {
                               const std::vector <GenerationConfig> &generationConfigs,
                               const LastTokensManager &lastTokens = LastTokensManager(),
                               std::vector <std::vector <float>*> *logits = nullptr);
 
+        // Whether an AttentionMask needs to be generated
+        virtual bool NeedAttentionMask(int qlen, int klen);
+
         // Generate LLM inference inputs from the input tokens
         virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
5 changes: 5 additions & 0 deletions src/devices/cpu/cpudevice.cpp
@@ -275,13 +275,18 @@ namespace fastllm {
         void Run() {
             float *qk = new float[k1];
             float *temp = new float[k1];
+            int base = k1 - q1;
             for (int i = 0; i < q1; i++) {
                 float maxValue = -10000, sum = 0.0;
                 for (int j = 0; j < k1; j++) {
                     if (maskd && maskd[i * k1 + j] > 0.99) {
                         qk[j] = -10000;
                         continue;
                     }
+                    if (!maskd && (base + i) < j) {
+                        qk[j] = -10000;
+                        continue;
+                    }
                     float now = 0.0f;
                     int l = 0;
 #ifdef __aarch64__
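The added branch above gives the CPU attention kernel an implicit causal mask for the case where no attentionMask data is passed in: with q1 new query rows appended after k1 - q1 cached keys, base = k1 - q1 and row i may only attend to key positions j <= base + i. A minimal standalone sketch (hypothetical sizes, not fastllm code) that prints the positions this rule blocks:

    #include <cstdio>

    int main() {
        int q1 = 3, k1 = 8;      // e.g. 3 new tokens attending over 8 total key positions
        int base = k1 - q1;      // 5 previously cached positions
        for (int i = 0; i < q1; i++) {
            for (int j = 0; j < k1; j++) {
                // '.' = attended, 'x' = blocked by the (base + i) < j rule
                putchar((base + i) < j ? 'x' : '.');
            }
            putchar('\n');
        }
        return 0;
    }

Each query row attends to every cached key plus its own and earlier new positions, which is exactly the pattern an explicit causal mask would encode with 1s in the blocked cells.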
26 changes: 19 additions & 7 deletions src/models/basellm.cpp
@@ -890,6 +890,10 @@ printf("len = %d, spend = %f s. tokens / s = %f\n", (int)total, spend, (float)to
         }
     }
 
+    bool basellm::NeedAttentionMask(int qlen, int klen) {
+        return true;
+    }
+
     // Generate LLM inference inputs from the input tokens
     void basellm::FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
                                 const std::map <std::string, int> &params,
@@ -903,18 +907,25 @@ printf("len = %d, spend = %f s. tokens / s = %f\n", (int)total, spend, (float)to
 
         if (inputTokens[0].size() > 1) {
             int seqLen = inputTokens[0].size();
-            std::vector <float> vmask = std::vector <float> (seqLen * promptLen, 0);
             std::vector <float> vpids = std::vector <float> (seqLen, 0);
             for (int i = 0; i < seqLen; i++) {
                 vpids[i] = promptLen - seqLen + i;
-                for (int j = i + 1; j < seqLen; j++) {
-                    vmask[i * promptLen + (promptLen - seqLen + j)] = 1;
-                }
             }
             inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, inputTokens[0]));
-            attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, promptLen}, vmask));
             positionIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, vpids));
+
+            if (NeedAttentionMask(seqLen, promptLen)) {
+                std::vector <float> vmask = std::vector <float> (seqLen * promptLen, 0);
+                for (int i = 0; i < seqLen; i++) {
+                    vpids[i] = promptLen - seqLen + i;
+                    for (int j = i + 1; j < seqLen; j++) {
+                        vmask[i * promptLen + (promptLen - seqLen + j)] = 1;
+                    }
+                }
+                attentionMask.CopyFrom(Data(DataType::FLOAT32, {seqLen, promptLen}, vmask));
+            } else {
+                attentionMask = Data();
+            }
         } else {
             inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, inputTokens[0]));
             attentionMask = Data();
@@ -956,7 +967,8 @@ printf("len = %d, spend = %f s. tokens / s = %f\n", (int)total, spend, (float)to
         if (dataType == DataType::FLOAT32) {
 
         } else if (dataType == DataType::FLOAT16) {
-            AssertInFastLLM(this->model_type == "chatglm" || this->model_type == "llama",
+            AssertInFastLLM(this->model_type == "chatglm" || this->model_type == "llama" ||
+                            this->model_type == "qwen",
                             this->model_type + " doesn't support float16");
         } else {
             ErrorInFastLLM("SetDataType Error: datatype should be float32 or float16");
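For reference, a minimal standalone sketch (hypothetical sizes, not fastllm code) of the {seqLen, promptLen} mask that the gated branch above still builds when NeedAttentionMask() returns true; a value of 1 marks a blocked position, matching the maskd[...] > 0.99 check in cpudevice.cpp:

    #include <cstdio>
    #include <vector>

    int main() {
        int seqLen = 3, promptLen = 5;   // 3 new tokens, 5 total positions including the cached prefix
        std::vector<float> vmask(seqLen * promptLen, 0);
        for (int i = 0; i < seqLen; i++) {
            for (int j = i + 1; j < seqLen; j++) {
                vmask[i * promptLen + (promptLen - seqLen + j)] = 1;   // same indexing as FillLLMInputs
            }
        }
        for (int i = 0; i < seqLen; i++) {
            for (int j = 0; j < promptLen; j++) {
                putchar(vmask[i * promptLen + j] > 0.99 ? 'x' : '.');  // 'x' = blocked
            }
            putchar('\n');
        }
        return 0;
    }

When NeedAttentionMask() returns false, the else branch leaves attentionMask as an empty Data(), the kernel's implicit causal rule reproduces the same pattern, and the seqLen * promptLen mask never has to be allocated or filled.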
2 changes: 1 addition & 1 deletion src/models/chatglm.cpp
@@ -831,7 +831,7 @@ namespace fastllm {
             }
         }
 
-        if (seqLen <= 4096) {
+        if (seqLen <= 1024) {
             std::vector<float> vmask = std::vector<float>(seqLen * seqLen, 0);
             for (int i = 0; i < seqLen - 1; i++) {
                 vmask[i * seqLen + seqLen - 1] = 1;
9 changes: 9 additions & 0 deletions src/models/llama.cpp
@@ -823,6 +823,15 @@ namespace fastllm {
         return lastRet;
     }
 
+    bool LlamaModel::NeedAttentionMask(int qlen, int klen) {
+        return false;
+        if (this->weight.dicts["use_alibi"] != "1" &&
+            ((qlen == 1) || (qlen >= 1024))) {
+            return false;
+        }
+        return true;
+    }
+
     void LlamaModel::FillLLMInputsBatch(std::vector<std::vector<float>> &inputTokens,
                                         const std::vector<std::map<std::string, int>> &params,
                                         fastllm::Data &inputIds, fastllm::Data &attentionMask,
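The override above plugs into the virtual hook declared in basellm.h, so the base-class FillLLMInputs decides per model whether to build the explicit mask. A minimal sketch (hypothetical classes, not fastllm code) of that dispatch pattern:

    #include <cstdio>

    struct BaseModel {
        virtual ~BaseModel() {}
        // default mirrors basellm::NeedAttentionMask: always build the mask
        virtual bool NeedAttentionMask(int qlen, int klen) { return true; }
        void FillInputs(int qlen, int klen) {            // stand-in for FillLLMInputs
            if (NeedAttentionMask(qlen, klen)) {
                printf("building %d x %d attention mask\n", qlen, klen);
            } else {
                printf("leaving attentionMask empty (implicit causal mask)\n");
            }
        }
    };

    struct CausalOnlyModel : BaseModel {
        // opts out, as LlamaModel::NeedAttentionMask does in this commit
        bool NeedAttentionMask(int qlen, int klen) override { return false; }
    };

    int main() {
        BaseModel base;
        CausalOnlyModel llamaLike;
        base.FillInputs(3, 5);        // takes the mask-building path
        llamaLike.FillInputs(3, 5);   // takes the empty-mask path
        return 0;
    }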
