update to use new attention_type interface
Signed-off-by: NickLucche <[email protected]>
NickLucche committed Jan 9, 2025
1 parent 3eae4f6 commit 455d0cb
Showing 1 changed file with 3 additions and 7 deletions.
10 changes: 3 additions & 7 deletions vllm/model_executor/models/t5.py
@@ -202,7 +202,8 @@ def __init__(self,
      1.0,
      cache_config=cache_config,
      quant_config=quant_config,
-     prefix=f"{prefix}.attn")
+     prefix=f"{prefix}.attn",
+     attn_type=self.attn_type)

# Only the first SelfAttention block in encoder decoder has this
# embedding layer, the others reuse its output.
@@ -418,12 +419,7 @@ def forward(
  # Encoder/Decoder Self-Attention Layer, attn bias already cached.
  assert attn_bias is not None

- attn_output = self.attn(q,
-                         k,
-                         v,
-                         kv_cache,
-                         attn_metadata,
-                         attn_type=self.attn_type)
+ attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
output, _ = self.out_proj(attn_output)
return output

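For context, a minimal sketch of the interface change this commit adopts. The class name, constructor parameters beyond those visible in the diff, and the import path are assumptions modeled on vLLM's attention layer (Attention / AttentionType); the point the diff confirms is that attn_type is now fixed once when the Attention layer is constructed instead of being passed on every forward call.

# Minimal sketch (simplified, assumed signatures) of the attention_type change.
from vllm.attention import Attention, AttentionType

class T5AttentionSketch:  # hypothetical stand-in for the T5 attention module
    def __init__(self, num_heads, head_size, cache_config=None,
                 quant_config=None, prefix=""):
        self.attn_type = AttentionType.ENCODER  # or DECODER / ENCODER_DECODER
        # New interface: the attention type is declared once at construction.
        self.attn = Attention(num_heads,
                              head_size,
                              1.0,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              prefix=f"{prefix}.attn",
                              attn_type=self.attn_type)

    def forward(self, q, k, v, kv_cache, attn_metadata):
        # Old interface (removed by this commit): attn_type was repeated per call:
        #     self.attn(q, k, v, kv_cache, attn_metadata, attn_type=self.attn_type)
        # New interface: the call site no longer carries it.
        return self.attn(q, k, v, kv_cache, attn_metadata)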
