Merge branch 'main' into clean_readme

huggingface · Feb 6, 2025 · f3d9bf3 · f3d9bf3
2 parents ad88caa + 5c7ea15
commit f3d9bf3
Show file tree

Hide file tree

Showing 111 changed files with 6,881 additions and 2,910 deletions.
diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml
@@ -60,7 +60,7 @@ jobs:
             --net=host \
             --ipc=host \
             vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \
-            /bin/bash tests/ci/slow_tests_diffusers.sh
+            /bin/bash tests/ci/slow_tests_diffusers.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
   deepspeed:
     name: Test DeepSpeed models
     if: ${{ !cancelled() && (success() || failure()) }}

diff --git a/Makefile b/Makefile
@@ -98,7 +98,7 @@ slow_tests_deepspeed: test_installs
 
 slow_tests_diffusers: test_installs
 	python -m pip install -r examples/stable-diffusion/requirements.txt
-	python -m pytest tests/test_diffusers.py -v -s -k "test_textual_inversion"
+	python -m pytest tests/test_diffusers.py -v -s -k "textual_inversion"
 	python -m pip install peft==0.7.0
 	python -m pytest tests/test_diffusers.py -v -s -k "test_train_text_to_image_"
 	python -m pytest tests/test_diffusers.py -v -s -k "test_train_controlnet"
@@ -107,6 +107,7 @@ slow_tests_diffusers: test_installs
 
 # Run text-generation non-regression tests
 slow_tests_text_generation_example: test_installs
+	python -m pip install triton==3.1.0 autoawq
 	BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git
 	python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
 	python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN)
@@ -120,6 +121,11 @@ slow_tests_openclip_vqa_example: test_installs
 	python -m pip install -r examples/visual-question-answering/openclip_requirements.txt
 	python -m pytest tests/test_openclip_vqa.py
 
+# Run video comprehension tests
+slow_tests_video_llava_example: test_installs
+	python -m pip install -r examples/video-comprehension/requirements.txt
+	python -m pytest tests/test_video_llava.py
+
 slow_tests_fsdp: test_installs
 	python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN)
 

diff --git a/README.md b/README.md
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
@@ -105,10 +105,12 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | TableTransformer |       | <div style="text-align:left"><li>Single card</li></div> | <li>[table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)</li> |
 | DETR         |          | <div style="text-align:left"><li>Single card</li></div> | <li>[object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)</li> |
 | Mllama     | <div style="text-align:left"><li>LoRA</li></div> |✅      | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+| Video-LLaVA         |          | <div style="text-align:left"><li>Single card</li></div> | <li>[video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |
 | MiniCPM3 |   | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | Baichuan2 | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| DeepSeek-V2 |   | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+| DeepSeek-V2 | ✅ | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | ChatGLM     | <div style="text-align:left"><li>DeepSpeed</li></div> |  <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+| Qwen2-VL |          |  <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 
 - Diffusers
 
@@ -119,7 +121,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | Stable Diffusion Depth2img | | <li>Single card</li> | <li>[depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
 | LDM3D               |          | <div style="text-align:left"><li>Single card</li></div> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
 | FLUX.1              | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
-| Text to Video       |          | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li> |
+| Text to Video       |          | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)</li> |
 
 - PyTorch Image Models/TIMM:
 

diff --git a/examples/audio-classification/requirements.txt b/examples/audio-classification/requirements.txt
@@ -1,3 +1,4 @@
 datasets>=1.14.0
 evaluate
+numba==0.60.0
 librosa
diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py
@@ -177,6 +177,31 @@ class ModelArguments:
         default=False,
         metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
     )
+    use_flash_attention: bool = field(
+        default=False, metadata={"help": "Whether to use Habana flash attention for fine-tuning"}
+    )
+    flash_attention_recompute: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to enable recompute in Habana flash attention for fine-tuning."
+            " It is applicable only when use_flash_attention is True."
+        },
+    )
+    flash_attention_fast_softmax: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use fast softmax for Habana flash attention."
+            " It is applicable only when use_flash_attention is True."
+        },
+    )
+
+    def __post_init__(self):
+        if self.flash_attention_recompute:
+            assert self.use_flash_attention, "flash_attention_recompute is set, but use_flash_attention is not"
+            os.environ["FLASH_ATTENTION_RECOMPUTE"] = "1"
+        if self.flash_attention_fast_softmax:
+            assert self.use_flash_attention, "flash_attention_fast_softmax is set, but use_flash_attention is not"
+            os.environ["FLASH_ATTENTION_FAST_SOFTMAX"] = "1"
 
 
 def main():
@@ -364,6 +389,7 @@ def compute_metrics(eval_pred):
         revision=model_args.model_revision,
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
+        attn_implementation="sdpa" if model_args.use_flash_attention else "eager",
     )
     model = AutoModelForAudioClassification.from_pretrained(
         model_args.model_name_or_path,