From 455d0cb7288e50fd1037b7393b2d85f4053d2da4 Mon Sep 17 00:00:00 2001
From: NickLucche
Date: Thu, 9 Jan 2025 15:26:44 +0000
Subject: [PATCH] update to use new attention_type interface

Signed-off-by: NickLucche
---
 vllm/model_executor/models/t5.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/t5.py b/vllm/model_executor/models/t5.py
index a8327a6e8128a..50c7c26659507 100644
--- a/vllm/model_executor/models/t5.py
+++ b/vllm/model_executor/models/t5.py
@@ -202,7 +202,8 @@ def __init__(self,
                               1.0,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
+                              prefix=f"{prefix}.attn",
+                              attn_type=self.attn_type)
 
         # Only the first SelfAttention block in encoder decoder has this
         # embedding layer, the others reuse its output.
@@ -418,12 +419,7 @@ def forward(
 
         # Encoder/Decoder Self-Attention Layer, attn bias already cached.
         assert attn_bias is not None
-        attn_output = self.attn(q,
-                                k,
-                                v,
-                                kv_cache,
-                                attn_metadata,
-                                attn_type=self.attn_type)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
        output, _ = self.out_proj(attn_output)
        return output
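
For context, below is a minimal sketch of the interface change this patch applies: the attention type is now fixed once when the Attention layer is constructed, instead of being passed as a keyword argument on every forward call. The class and field names other than Attention, AttentionType, and attn_type are illustrative placeholders, not the actual T5 module code from vllm/model_executor/models/t5.py.

# Illustrative sketch only; assumes a vLLM install where vllm.attention
# exports Attention and AttentionType. Names like num_heads, head_size,
# kv_cache, and attn_metadata are placeholders for the real model wiring.
from vllm.attention import Attention, AttentionType


class SelfAttentionSketch:

    def __init__(self, num_heads, head_size, cache_config=None,
                 quant_config=None, prefix=""):
        # ENCODER here is just an example; the real layer picks its role
        # (encoder, decoder, or cross-attention) when it is built.
        self.attn_type = AttentionType.ENCODER
        # New interface: attn_type is passed once at construction time.
        self.attn = Attention(num_heads,
                              head_size,
                              1.0,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              prefix=f"{prefix}.attn",
                              attn_type=self.attn_type)

    def forward(self, q, k, v, kv_cache, attn_metadata):
        # ...so the per-call attn_type kwarg used previously is dropped here.
        return self.attn(q, k, v, kv_cache, attn_metadata)

This keeps the hot forward path free of per-call configuration and matches the two hunks above: the constructor gains attn_type, and the forward call loses it.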