fix: add legacy attributes to Llama attention
The parallelization code expects these parameters to be set. A proper
fix would be to write a specific Llama parallel model.
dacorvo committed Jan 29, 2025
1 parent 292fbea commit 565639f
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions optimum/neuron/distributed/decoder_models.py
@@ -588,6 +588,10 @@ def _parallelize(
         layers = model.model.layers

         for layer in layers:
+            # FIXME: temporary workaround to avoid too many changes in the transformation code
+            layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads
+            layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads
+            layer.self_attn.hidden_size = layer.self_attn.config.hidden_size
             layer.self_attn = LlamaParallelSelfAttention.transform(
                 model,
                 layer.self_attn,
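For context, the workaround copies values that may only live on the attention module's config onto the module itself, because the transformation code reads them directly from the module. The sketch below is a hypothetical helper (not the actual optimum-neuron transformation code) illustrating the kind of access pattern the commit message refers to.

# Hypothetical helper, not the actual optimum-neuron code. It shows why the
# legacy attributes must exist on the module: every lookup below goes through
# `self_attn`, not `self_attn.config`.
def split_attention_for_tp(self_attn, tp_size: int):
    if self_attn.num_heads % tp_size != 0:
        raise ValueError("num_heads must be divisible by the tensor parallel size")
    heads_per_rank = self_attn.num_heads // tp_size
    kv_heads_per_rank = max(self_attn.num_key_value_heads // tp_size, 1)
    head_dim = self_attn.hidden_size // self_attn.num_heads
    return heads_per_rank, kv_heads_per_rank, head_dim

Without the assignments added in this commit, such attribute lookups would raise AttributeError on a LlamaAttention instance that only stores these values on its config.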
