diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 9d6b2d4005..cfc901a9e4 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -51,7 +51,7 @@ jobs: echo ${{ env.COMMIT_SHA }} > ./commit_sha echo ${{ env.PR_NUMBER }} > ./pr_number - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: doc-build-artifact path: optimum-habana/habana-doc-build/ diff --git a/.github/workflows/fast_tests.yml b/.github/workflows/fast_tests.yml index e2d58fbe25..cdd7d1dbf5 100644 --- a/.github/workflows/fast_tests.yml +++ b/.github/workflows/fast_tests.yml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -36,7 +36,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/fast_tests.sh diffusers: name: Run tests for optimum.habana.diffusers @@ -46,7 +46,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -61,5 +61,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/fast_tests_diffusers.sh diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml index b969273a3c..d0fcb85051 100644 --- a/.github/workflows/slow_tests.yml +++ b/.github/workflows/slow_tests.yml @@ -19,7 +19,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -31,7 +31,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/example_diff_tests.sh stable-diffusion: name: Test Stable Diffusion @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -57,7 +57,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_diffusers.sh deepspeed: name: Test DeepSpeed models @@ -72,7 +72,7 @@ jobs: uses: 
actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -84,7 +84,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_deepspeed.sh multi-card: name: Test multi-card models @@ -99,7 +99,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -111,7 +111,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_8x.sh single-card: name: Test single-card models @@ -127,7 +127,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -139,7 +139,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_1x.sh albert-xxl-single-card: name: Test single-card ALBERT XXL @@ -158,7 +158,7 @@ jobs: - name: Pull image if: github.event.schedule == '0 21 * * 6' run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run test if: github.event.schedule == '0 21 * * 6' run: | @@ -171,7 +171,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/albert_xxl_1x.sh - name: Warning if: github.event.schedule != '0 21 * * 6' @@ -192,7 +192,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -204,7 +204,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} trl: name: Test TRL integration @@ -223,7 +223,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull 
vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -235,7 +235,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_trl.sh sentence-transformers: name: Test Sentence Transformers integration @@ -263,7 +263,7 @@ jobs: path: sentence-transformers - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -275,5 +275,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash optimum-habana/tests/ci/sentence_transformers.sh diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml index 2e561a2765..86b50d6e2c 100644 --- a/.github/workflows/slow_tests_gaudi2.yml +++ b/.github/workflows/slow_tests_gaudi2.yml @@ -17,7 +17,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -30,7 +30,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/example_diff_tests.sh stable-diffusion: name: Test Stable Diffusion @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -59,8 +59,8 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ - /bin/bash tests/ci/slow_tests_diffusers.sh + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + /bin/bash tests/ci/slow_tests_diffusers.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} deepspeed: name: Test DeepSpeed models if: ${{ !cancelled() && (success() || failure()) }} @@ -72,7 +72,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -88,7 +88,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash 
tests/ci/slow_tests_deepspeed.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} fsdp: name: Test FSDP models @@ -101,7 +101,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -117,7 +117,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ make slow_tests_fsdp TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} multi-card: name: Test multi-card models @@ -130,7 +130,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -146,7 +146,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_8x.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} single-card: name: Test single-card models @@ -160,7 +160,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -177,7 +177,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_1x.sh text-generation: name: Test text-generation example @@ -192,7 +192,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -208,7 +208,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} trl: name: Test TRL integration @@ -221,7 +221,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -237,7 +237,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash tests/ci/slow_tests_trl.sh sentence-transformers: name: Test 
Sentence Transformers integration @@ -258,7 +258,7 @@ jobs: path: sentence-transformers - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - name: Run tests run: | docker run \ @@ -274,5 +274,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest \ + vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ /bin/bash optimum-habana/tests/ci/sentence_transformers.sh diff --git a/Makefile b/Makefile index e6989aa1b0..34fd13bd07 100644 --- a/Makefile +++ b/Makefile @@ -93,12 +93,12 @@ slow_tests_8x: test_installs # Run DeepSpeed non-regression tests slow_tests_deepspeed: test_installs - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 + python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 python -m pytest tests/test_examples.py -v -s -k "deepspeed" slow_tests_diffusers: test_installs python -m pip install -r examples/stable-diffusion/requirements.txt - python -m pytest tests/test_diffusers.py -v -s -k "test_textual_inversion" + python -m pytest tests/test_diffusers.py -v -s -k "textual_inversion" python -m pip install peft==0.7.0 python -m pytest tests/test_diffusers.py -v -s -k "test_train_text_to_image_" python -m pytest tests/test_diffusers.py -v -s -k "test_train_controlnet" @@ -107,8 +107,9 @@ slow_tests_diffusers: test_installs # Run text-generation non-regression tests slow_tests_text_generation_example: test_installs + python -m pip install triton==3.1.0 autoawq BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 + python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN) # Run image-to-text non-regression tests @@ -120,6 +121,11 @@ slow_tests_openclip_vqa_example: test_installs python -m pip install -r examples/visual-question-answering/openclip_requirements.txt python -m pytest tests/test_openclip_vqa.py +# Run video comprehension tests +slow_tests_video_llava_example: test_installs + python -m pip install -r examples/video-comprehension/requirements.txt + python -m pytest tests/test_video_llava.py + slow_tests_fsdp: test_installs python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN) diff --git a/README.md b/README.md index 429caebffd..e363edb26e 100644 --- a/README.md +++ b/README.md @@ -25,24 +25,30 @@ limitations under the License. # Optimum for IntelĀ® GaudiĀ® Accelerators -Optimum for Intel Gaudi - a.k.a. `optimum-habana` - is the interface between the Transformers and Diffusers libraries and [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html). -It provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks. -The list of officially validated models and tasks is available [here](https://github.com/huggingface/optimum-habana#validated-models). Users can try other of the thousands of Hugging Face models on Intel Gaudi accelerators and tasks with only few changes. +Optimum for Intel Gaudi - a.k.a. 
`optimum-habana` - is the interface between the Transformers and Diffusers libraries and +[Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html). It provides a set of tools enabling easy +model loading, training and inference on single- and multi-HPU settings for different downstream tasks. The list of officially +validated models and tasks is available [here](https://github.com/huggingface/optimum-habana#validated-models). Users can +try other of the thousands of Hugging Face models on Intel Gaudi accelerators and tasks with only few changes. ## What are Intel Gaudi AI Accelerators (HPUs)? HPUs offer fast model training and inference as well as a great price-performance ratio. -Check out [this blog post about BLOOM inference](https://huggingface.co/blog/habana-gaudi-2-bloom) and [this post benchmarking Intel Gaudi 2 and NVIDIA A100 GPUs for BridgeTower training](https://huggingface.co/blog/bridgetower) for concrete examples. +Check out [this blog post about BLOOM inference](https://huggingface.co/blog/habana-gaudi-2-bloom) and +[this post benchmarking Intel Gaudi 2 and NVIDIA A100 GPUs for BridgeTower training](https://huggingface.co/blog/bridgetower) +for concrete examples. ## Gaudi Setup Please refer to the Intel Gaudi AI Accelerator official [installation guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). -> Tests should be run in a Docker container based on Intel Gaudi Docker images. -> -> The current version has been validated for SynapseAI 1.18. +> [!NOTE] +> Tests should be run in a Docker container based on Intel Gaudi's official images. Instructions to +> obtain the latest containers from the Intel Gaudi Vault are available +> [here](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html#use-intel-gaudi-containers). +> The current Optimum for Intel Gaudi has been validated with Intel Gaudi v1.19 stack. ## Install the library and get example scripts @@ -50,18 +56,18 @@ Please refer to the Intel Gaudi AI Accelerator official [installation guide](htt ### Option 1: Use the latest stable release To install the latest stable release of this package ->```bash ->pip install --upgrade-strategy eager optimum[habana] ->``` +```bash +pip install --upgrade-strategy eager optimum[habana] +``` The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is upgraded to the latest stable release. To use the example associated with the latest stable release, run: -> ``` -> git clone https://github.com/huggingface/optimum-habana -> cd optimum-habana && git checkout v1.14.0 -> ``` -> with `v1.14.0` the version number of this release. +```bash +git clone https://github.com/huggingface/optimum-habana +cd optimum-habana && git checkout v1.15.0 +``` +with `v1.15.0` being the latest Optimum for Intel Gaudi release version. ### Option 2: Use the latest main branch under development @@ -74,7 +80,8 @@ git clone https://github.com/huggingface/optimum-habana ### Option 3: Use the `transformers_future` branch to have the latest changes from Transformers -The `transformers_future` branch is regularly updated with the latest changes from the main branches of Optimum Habana and Transformers. This enables you to try out new Transformers features that have not been merged into the main branch yet. +The `transformers_future` branch is regularly updated with the latest changes from the main branches of Optimum for Intel Gaudi +and Transformers. 
This enables you to try out new Transformers features that have not been merged into the main branch yet. > [!WARNING] > The `transformers_future` branch may have some regressions or bugs and may be less stable than the main branch. @@ -84,34 +91,40 @@ pip install git+https://github.com/huggingface/optimum-habana.git@transformers_f git clone -b transformers_future https://github.com/huggingface/optimum-habana ``` -## Install dependencies +## Install Dependencies To use DeepSpeed on HPUs, you also need to run the following command: ->```bash ->pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 ->``` +```bash +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +``` To install the requirements for every example: ->```bash ->cd ->pip install -r requirements.txt ->``` - +```bash +cd +pip install -r requirements.txt +``` ## How to use it? -### Quick Start +Optimum for Intel Gaudi was designed with one goal in mind: **to make training and inference straightforward for Transformers +and Diffusers users, while fully leveraging the power of Intel Gaudi AI Accelerators**. -Optimum for Intel Gaudi was designed with one goal in mind: **to make training and inference straightforward for Transformers and Diffusers users, while fully leveraging the power of Intel Gaudi AI Accelerators**. - -#### Transformers Interface +### Transformers Interface There are two main classes one needs to know: -- [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer): the trainer class that takes care of compiling and distributing the model to run on HPUs, and performing training and evaluation. -- [GaudiConfig](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config): the class that enables to configure Habana Mixed Precision and to decide whether optimized operators and optimizers should be used or not. -The [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer) is very similar to the [Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer), and adapting a script using the Trainer to make it work with Intel Gaudi accelerators will mostly consist in simply swapping the `Trainer` class for the `GaudiTrainer` one. -That's how most of the [example scripts](https://github.com/huggingface/optimum-habana/tree/main/examples) were adapted from their [original counterparts](https://github.com/huggingface/transformers/tree/main/examples/pytorch). +- [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer): the trainer class that takes care of + compiling and distributing the model to run on HPUs, and performing training and evaluation. + +- [GaudiConfig](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config): the class that enables to configure + Gaudi Mixed Precision and to decide whether optimized operators and optimizers should be used or not. + +The [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer) is very similar to the +[Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer), and adapting a script using the Trainer to +make it work with Intel Gaudi accelerators will mostly consist in simply swapping the `Trainer` class for the `GaudiTrainer` one. + +That's how most of the [example scripts](https://github.com/huggingface/optimum-habana/tree/main/examples) were adapted from their +[original counterparts](https://github.com/huggingface/transformers/tree/main/examples/pytorch). 
Here is an example: ```diff @@ -141,12 +154,17 @@ Here is an example: ) ``` -where `gaudi_config_name` is the name of a model from the [Hub](https://huggingface.co/Habana) (Intel Gaudi configurations are stored in model repositories) or a path to a local Intel Gaudi configuration file (you can see [here](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) how to write your own). +where `gaudi_config_name` is the name of a model from the [Hub](https://huggingface.co/Habana) (Intel Gaudi configurations +are stored in model repositories) or a path to a local Intel Gaudi configuration file (you can see +[here](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) how to write your own). -#### Diffusers Interface +### Diffusers Interface -You can generate images from prompts using Stable Diffusion on Intel Gaudi using the [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline) class and the [`GaudiDDIMScheduler`] which have been both optimized for HPUs. Here is how to use them and the differences with the Diffusers library: +You can generate images from prompts using Stable Diffusion on Intel Gaudi using the +[`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline) class and the +[`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler) +class which have been both optimized for HPUs. Here is how to use them and the differences with the Diffusers library: ```diff - from diffusers import DDIMScheduler, StableDiffusionPipeline @@ -167,7 +185,7 @@ model_name = "CompVis/stable-diffusion-v1-4" + gaudi_config="Habana/stable-diffusion", ) -outputs = generator( +outputs = pipeline( ["An image of a squirrel in Picasso style"], num_images_per_prompt=16, + batch_size=4, @@ -175,6 +193,27 @@ outputs = generator( ``` +## Important Note on Pytorch 2.5 Performance Degradation + +With the upgrade to PyTorch 2.5, users may experience some performance degradation due to changes in the handling of FP16/BF16 inputs. +The note from PyTorch 2.5 states: + +"A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the usage of low-precision +intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. Computations are performed +in FP32/TF32, and the final FP32 results are then downcasted back to FP16/BF16. This will improve numerical accuracy of the final output for +the math backend with FP16/BF16 inputs, but increases memory usages and may cause the performance regressions in the math backend as computations +shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul." + +For scenarios where reduced-precision reductions are preferred for speed, they can be enabled with the following setting: +```python +torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) +``` +Additionally, the next release of Optimum Habana will include a Gaudi-specific safe_softmax implementation that will also improve performance. + +More info: +- https://pytorch.org/docs/stable/notes/numerical_accuracy.html + + ### Documentation Check out [the documentation of Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/habana/index) for more advanced usage. 
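Relating to the PyTorch 2.5 note above, the following is a minimal, illustrative sketch (plain PyTorch >= 2.5, not Optimum for Intel Gaudi code; tensor shapes are arbitrary) of where the reduced-precision math-SDPA setting would typically be toggled around a scaled-dot-product-attention call:

```python
# Illustrative only: assumes a plain PyTorch >= 2.5 environment.
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)

with sdpa_kernel(SDPBackend.MATH):
    # PyTorch 2.5 default: BF16/FP16 inputs are upcast to FP32 inside the math backend
    # (better numerical accuracy, higher memory use, potentially slower).
    out_upcast = F.scaled_dot_product_attention(q, k, v)

    # Opt back into reduced-precision reductions when speed matters more than accuracy,
    # as described in the note above.
    torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
    out_fast = F.scaled_dot_product_attention(q, k, v)
```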
@@ -184,108 +223,106 @@ Check out [the documentation of Optimum for Intel Gaudi](https://huggingface.co/ The following model architectures, tasks and device distributions have been validated for Optimum for Intel Gaudi: +> [!NOTE] > In the tables below, :heavy_check_mark: means single-card, multi-card and DeepSpeed have all been validated. -- Transformers: -
+### Transformers: -| Architecture | Training | Inference |
Tasks
| -|--------------|:--------:|:---------:|:-----------------------| +| Architecture | Training | Inference | Tasks | +|:-------------|:--------:|:---------:|:------| | BERT | :heavy_check_mark: | :heavy_check_mark: |
  • [text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text feature extraction](https://github.com/huggingface/optimum-habana/tree/main/examples/text-feature-extraction)
  • | | RoBERTa | :heavy_check_mark: | :heavy_check_mark: |
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • | | ALBERT | :heavy_check_mark: | :heavy_check_mark: |
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • | | DistilBERT |:heavy_check_mark: | :heavy_check_mark: |
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • | | GPT2 | :heavy_check_mark: | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| BLOOM(Z) | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| StarCoder / StarCoder2 | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| GPT-J |
  • DeepSpeed
  • |
  • Single card
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| GPT-Neo | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| GPT-NeoX |
  • DeepSpeed
  • |
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| OPT | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| BLOOM(Z) | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| StarCoder / StarCoder2 | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| GPT-J |
  • DeepSpeed
  • |
  • Single card
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| GPT-Neo | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| GPT-NeoX |
  • DeepSpeed
  • |
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| OPT | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | | Llama 2 / CodeLlama / Llama 3 / Llama Guard / Granite | :heavy_check_mark: | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)
  • | -| StableLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Falcon |
  • LoRA
  • | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| CodeGen | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| MPT | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Mistral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Phi | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Mixtral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Persimmon | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Qwen2 |
  • Single card
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Qwen2-MoE | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Gemma | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| StableLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Falcon |
  • LoRA
  • | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| CodeGen | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| MPT | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Mistral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Phi | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Mixtral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Persimmon | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Qwen2 |
  • Single card
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Qwen2-MoE | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Gemma | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | | Gemma2 | | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| XGLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Cohere | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| XGLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Cohere | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | | T5 / Flan T5 | :heavy_check_mark: | :heavy_check_mark: |
  • [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)
  • [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)
  • | -| BART | |
  • Single card
  • |
  • [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)
  • [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)
  • | +| BART | |
  • Single card
  • |
  • [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)
  • [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)
  • | | ViT | :heavy_check_mark: | :heavy_check_mark: |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | | Swin | :heavy_check_mark: | :heavy_check_mark: |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | | Wav2Vec2 | :heavy_check_mark: | :heavy_check_mark: |
  • [audio classification](https://github.com/huggingface/optimum-habana/tree/main/examples/audio-classification)
  • [speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition)
  • | | Whisper | :heavy_check_mark: | :heavy_check_mark: |
  • [speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition)
  • | -| SpeechT5 | |
  • Single card
  • |
  • [text to speech](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-speech)
  • | +| SpeechT5 | |
  • Single card
  • |
  • [text to speech](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-speech)
  • | | CLIP | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | BridgeTower | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | -| ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | -| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| OWLViT | |
  • Single card
  • |
  • [zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)
  • | -| ClipSeg | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | -| Llava / Llava-next | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| idefics2 |
  • LoRA
  • |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| Paligemma | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| Segment Anything Model | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | -| VideoMAE | |
  • Single card
  • |
  • [Video classification](https://github.com/huggingface/optimum-habana/tree/main/examples/video-classification)
  • | -| TableTransformer | |
  • Single card
  • |
  • [table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)
  • | -| DETR | |
  • Single card
  • |
  • [object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)
  • | -| Mllama |
  • LoRA
  • | :heavy_check_mark: |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| MiniCPM3 | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Baichuan2 |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| DeepSeek-V2 | | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| ChatGLM |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -
    - -- Diffusers: - -
    - -| Architecture | Training | Inference | Tasks | -|------------------|:--------:|:--------------------:|:------| -| Stable Diffusion |
  • [textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)
  • [ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| Stable Diffusion XL |
  • [fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| Stable Diffusion Depth2img | |
  • Single card
  • |
  • [depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| LDM3D | |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| FLUX.1 |
  • [fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev)
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| Text to Video | |
  • Single card
  • |
  • [text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)
  • | +| ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | +| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| OWLViT | |
  • Single card
  • |
  • [zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)
  • | +| ClipSeg | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | +| Llava / Llava-next | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| idefics2 |
  • LoRA
  • |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| Paligemma | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| Segment Anything Model | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | +| VideoMAE | |
  • Single card
  • |
  • [Video classification](https://github.com/huggingface/optimum-habana/tree/main/examples/video-classification)
  • | +| TableTransformer | |
  • Single card
  • |
  • [table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)
  • | +| DETR | |
  • Single card
  • |
  • [object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)
  • | +| Mllama |
  • LoRA
  • | :heavy_check_mark: |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| MiniCPM3 | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Baichuan2 |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| DeepSeek-V2 | :heavy_check_mark: | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| ChatGLM |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Qwen2-VL | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| VideoLLaVA | |
  • Single card
  • |
  • [Video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)
  • |
    -- PyTorch Image Models/TIMM: -
    +### Diffusers: | Architecture | Training | Inference | Tasks | -|---------------------|:--------:|:---------:|:------| -| FastViT | |
  • Single card
  • |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | - -
    +|:--------------------|:--------:|:---------:|:------| +| Stable Diffusion | :heavy_check_mark: | :heavy_check_mark: |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion)
  • [image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-based-image-to-image)
  • | +| Stable Diffusion XL | :heavy_check_mark: | :heavy_check_mark: |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-xl-sdxl)
  • [image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-xl-refiner)
  • | +| Stable Diffusion Depth2img | |
  • Single card
  • |
  • [depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#depth-to-image-generation)
  • | +| Stable Diffusion 3 | |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-3-sd3)
  • | +| LDM3D | |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#latent-diffusion-model-for-3d-ldm3d)
  • | +| FLUX.1 |
  • LoRA
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#flux1)
  • [image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#flux1-image-to-image)
  • | +| Text to Video | |
  • Single card
  • |
  • [text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)
  • | +| Image to Video | |
  • Single card
  • |
  • [image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#image-to-video-generation)
  • | + +### PyTorch Image Models/TIMM: -- TRL: +| Architecture | Training | Inference | Tasks | +|:--------------------|:--------:|:---------:|:------| +| FastViT | |
  • Single card
  • |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | -
    +### TRL: | Architecture | Training | Inference | Tasks | -|------------------|:--------:|:--------------------:|:-----------------------------------------------------------------------------------------------| -| Llama 2 | :heavy_check_mark: | |
  • [DPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)
  • | -| Llama 2 | :heavy_check_mark: | |
  • [PPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)
  • | -| Stable Diffusion | :heavy_check_mark: | |
  • [DDPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)
  • | - -
    +|:-----------------|:--------:|:--------------------:|:-----------------------------------------------------------------------------------------------| +| Llama 2 | :heavy_check_mark: | |
  • [DPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl#dpo-pipeline)
  • | +| Llama 2 | :heavy_check_mark: | |
  • [PPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl#ppo-pipeline)
  • | +| Stable Diffusion | :heavy_check_mark: | |
  • [DDPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl#ddpo-pipeline)
  • | -Other models and tasks supported by the Transformers and Diffusers libraries may also work. You can refer to this [section](https://github.com/huggingface/optimum-habana#how-to-use-it) for using them with Optimum for Intel Gaudi. In addition, [this page](https://github.com/huggingface/optimum-habana/tree/main/examples) explains how to modify any [example](https://github.com/huggingface/transformers/tree/main/examples/pytorch) from the Transformers library to make it work with Optimum for Intel Gaudi. +Other models and tasks supported by the Transformers and Diffusers libraries may also work. You can refer to this [section](https://github.com/huggingface/optimum-habana#how-to-use-it) +for using them with Optimum for Intel Gaudi. In addition, [this page](https://github.com/huggingface/optimum-habana/tree/main/examples) explains how to modify any +[example](https://github.com/huggingface/transformers/tree/main/examples/pytorch) from the Transformers library to make it work with Optimum for Intel Gaudi. If you find any issues while using those, please open an issue or a pull request. -After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness. +After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed +to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on +the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness. ## Development diff --git a/conftest.py b/conftest.py index 5775644c48..71cb6bb7ca 100644 --- a/conftest.py +++ b/conftest.py @@ -1,88 +1,3 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# tests directory-specific settings - this file is run automatically -# by pytest before any tests are run -import doctest -import sys -import warnings -from os.path import abspath, dirname, join - -import _pytest -import pytest -from transformers.testing_utils import HfDoctestModule, HfDocTestParser - - -NOT_DEVICE_TESTS = { - "test_tokenization", - "test_processor", - "test_processing", - "test_beam_constraints", - "test_configuration_utils", - "test_data_collator", - "test_trainer_callback", - "test_trainer_utils", - "test_feature_extraction", - "test_image_processing", - "test_image_processor", - "test_image_transforms", - "test_optimization", - "test_retrieval", - "test_config", - "test_from_pretrained_no_checkpoint", - "test_keep_in_fp32_modules", - "test_gradient_checkpointing_backward_compatibility", - "test_gradient_checkpointing_enable_disable", - "test_save_load_fast_init_from_base", - "test_fast_init_context_manager", - "test_fast_init_tied_embeddings", - "test_save_load_fast_init_to_base", - "test_torch_save_load", - "test_initialization", - "test_forward_signature", - "test_model_get_set_embeddings", - "test_model_main_input_name", - "test_correct_missing_keys", - "test_tie_model_weights", - "test_can_use_safetensors", - "test_load_save_without_tied_weights", - "test_tied_weights_keys", - "test_model_weights_reload_no_missing_tied_weights", - "test_pt_tf_model_equivalence", - "test_mismatched_shapes_have_properly_initialized_weights", - "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist", - "test_model_is_small", - "test_tf_from_pt_safetensors", - "test_flax_from_pt_safetensors", - "ModelTest::test_pipeline_", # None of the pipeline tests from PipelineTesterMixin (of which XxxModelTest inherits from) are running on device - "ModelTester::test_pipeline_", - "/repo_utils/", - "/utils/", - "/agents/", -} - -# allow having multiple repository checkouts and not needing to remember to rerun -# `pip install -e '.[dev]'` when switching between checkouts and running tests. -git_repo_path = abspath(join(dirname(__file__), "src")) -sys.path.insert(1, git_repo_path) - -# silence FutureWarning warnings in tests since often we can't act on them until -# they become normal warnings - i.e. 
the tests still need to test the current functionality -warnings.simplefilter(action="ignore", category=FutureWarning) - - class Secret: """ Taken from: https://stackoverflow.com/a/67393351 @@ -98,47 +13,9 @@ def __str___(self): return "*******" -def pytest_configure(config): - config.addinivalue_line( - "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested" - ) - config.addinivalue_line( - "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" - ) - config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") - config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") - config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") - config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule") - config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu") - - -def pytest_collection_modifyitems(items): - for item in items: - if any(test_name in item.nodeid for test_name in NOT_DEVICE_TESTS): - item.add_marker(pytest.mark.not_device_test) - - def pytest_addoption(parser): parser.addoption("--token", action="store", default=None) - from transformers.testing_utils import pytest_addoption_shared - - pytest_addoption_shared(parser) - - -def pytest_terminal_summary(terminalreporter): - from transformers.testing_utils import pytest_terminal_summary_main - - make_reports = terminalreporter.config.getoption("--make-reports") - if make_reports: - pytest_terminal_summary_main(terminalreporter, id=make_reports) - - -def pytest_sessionfinish(session, exitstatus): - # If no tests are collected, pytest exists with code 5, which makes the CI fail. - if exitstatus == 5: - session.exitstatus = 0 - def pytest_generate_tests(metafunc): # This is called for every test. Only get/set command line arguments @@ -146,21 +23,3 @@ def pytest_generate_tests(metafunc): option_value = Secret(metafunc.config.option.token) if "token" in metafunc.fixturenames: metafunc.parametrize("token", [option_value]) - - -# Doctest custom flag to ignore output. -IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT") - -OutputChecker = doctest.OutputChecker - - -class CustomOutputChecker(OutputChecker): - def check_output(self, want, got, optionflags): - if IGNORE_RESULT & optionflags: - return True - return OutputChecker.check_output(self, want, got, optionflags) - - -doctest.OutputChecker = CustomOutputChecker -_pytest.doctest.DoctestModule = HfDoctestModule -doctest.DocTestParser = HfDocTestParser diff --git a/docs/Dockerfile b/docs/Dockerfile index 6dd8d3a29f..060b7413dc 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest ARG commit_sha ARG clone_url diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 51d6dadf0f..2b8cdf06ef 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -105,10 +105,12 @@ In the tables below, āœ… means single-card, multi-card and DeepSpeed have all be | TableTransformer | |
<li>Single card</li> | <li>[table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)</li> |
| DETR | | <li>Single card</li> | <li>[object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)</li> |
| Mllama | <li>LoRA</li> | ✅ | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+| Video-LLaVA | | <li>Single card</li> | <li>[video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |
| MiniCPM3 | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Baichuan2 | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| DeepSeek-V2 | | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+| DeepSeek-V2 | ✅ | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+| Qwen2-VL | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |

- Diffusers

@@ -119,7 +121,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
| Stable Diffusion Depth2img | | <li>Single card</li> | <li>[depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| FLUX.1 | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
-| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li> |
+| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)</li>
  • | - PyTorch Image Models/TIMM: diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 2b6e8a0a5c..fa54c4446e 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -24,7 +24,7 @@ python -m pip install --upgrade-strategy eager optimum[habana] To use MicrosoftĀ® DeepSpeed with Intel Gaudi devices, you also need to run the following command: ```bash -python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 +python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 ``` To ensure that you are installing the correct Intel Gaudi Software, please run the `hl-smi` command to confirm the software version diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index ec79ac05f9..c882de2629 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -32,12 +32,12 @@ platform for deep learning and follow the steps to start and connect to the node ## Docker Setup Now that you have access to the node, you will use the latest Intel Gaudi AI Accelerator docker image by executing the docker run command which will -automatically download and run the docker. At the time of writing this guide, latest Gaudi docker version was 1.18.0: +automatically download and run the docker. At the time of writing this guide, latest Gaudi docker version was 1.19.0: ```bash -release=1.18.0 +release=1.19.0 os=ubuntu22.04 -torch=2.4.0 +torch=2.5.1 docker_image=vault.habana.ai/gaudi-docker/$release/$os/habanalabs/pytorch-installer-$torch:latest ``` @@ -65,11 +65,11 @@ docker run -itd \ ## Optimum for Intel Gaudi Setup Check latest release of Optimum for Intel Gaudi [here](https://github.com/huggingface/optimum-habana/releases). -At the time of writing this guide, latest Optimum for Intel Gaudi release version was v1.14.0, which is paired with Intel Gaudi Software release -version 1.18.0. Install Optimum for Intel Gaudi as follows: +At the time of writing this guide, latest Optimum for Intel Gaudi release version was v1.15.0, which is paired with Intel Gaudi Software release +version 1.19.0. Install Optimum for Intel Gaudi as follows: ```bash -git clone -b v1.14.0 https://github.com/huggingface/optimum-habana +git clone -b v1.15.0 https://github.com/huggingface/optimum-habana pip install ./optimum-habana ``` @@ -115,7 +115,7 @@ MicrosoftĀ® DeepSpeed. Gaudi-specific fork of the library is maintained by Intel To install the library compatible with the same Gaudi software release stack, use: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 ``` With DeepSpeed successfully installed we can now run a distributed GPT-2 inference on an 8 HPU system as follows: @@ -135,7 +135,7 @@ run_generation.py \ šŸ¤— Optimum for Intel Gaudi contains a number of examples demonstrating single and multi Gaudi device training/fine-tuning. -For example, a number of language models can be trained with the scripts provided +For example, a number of language models can be trained with the scripts provided [language modeling examples section](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling). As an illustration, let us run GPT-2 single and multi card training examples on Gaudi. 
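To make the quickstart's "let us run GPT-2" step concrete, here is a rough sketch of what such a single-card run can look like, assembled from the `run_clm.py` flags used in the language-modeling examples elsewhere in this changeset; it is an illustration, not a command taken from quickstart.mdx itself:

```bash
# Illustrative sketch only: flags mirror the examples/language-modeling/run_clm.py commands in this repo.
python run_clm.py \
    --model_name_or_path gpt2 \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --per_device_train_batch_size 4 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-clm \
    --gaudi_config_name Habana/gpt2 \
    --use_habana \
    --use_lazy_mode \
    --use_hpu_graphs_for_inference \
    --throughput_warmup_steps 3 \
    --bf16
```

A multi-card variant would be launched through `../gaudi_spawn.py --world_size 8 --use_mpi`, as in the language-modeling README.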
@@ -239,7 +239,7 @@ outputs = pipeline( ) ``` -In addition, sample scripts for fine-tuning diffusion models are given in +In addition, sample scripts for fine-tuning diffusion models are given in [Stable Diffusion training section](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training). A more comprehensive list of examples in Optimum for Intel Gaudi is given next. diff --git a/docs/source/usage_guides/deepspeed.mdx b/docs/source/usage_guides/deepspeed.mdx index 833358d9c4..f6617e92ce 100644 --- a/docs/source/usage_guides/deepspeed.mdx +++ b/docs/source/usage_guides/deepspeed.mdx @@ -32,7 +32,7 @@ You can find more information about DeepSpeed Gaudi integration [here](https://d To use DeepSpeed on Gaudi, you need to install Optimum for Intel Gaudi and [DeepSpeed fork for Intel Gaudi](https://github.com/HabanaAI/DeepSpeed) with: ```bash pip install optimum[habana] -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 ``` @@ -79,7 +79,7 @@ It is strongly advised to read [this section](https://huggingface.co/docs/transf -Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.18.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Intel. +Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.19.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Intel. The [Transformers documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed#configuration) explains how to write a configuration from scratch very well. A more complete description of all configuration possibilities is available [here](https://www.deepspeed.ai/docs/config-json/). diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md index aaa45425cc..64f5e0daba 100644 --- a/examples/audio-classification/README.md +++ b/examples/audio-classification/README.md @@ -56,6 +56,7 @@ python run_audio_classification.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code True ``` @@ -93,6 +94,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ --use_lazy_mode False\ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code True \ --torch_compile \ @@ -105,52 +107,6 @@ On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.4 > If you get an error reporting unused parameters in the model, you can specify `--ddp_find_unused_parameters True`. Using this parameter might affect the training speed. - -## DeepSpeed - -> You need to install DeepSpeed with: -> ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 -> ``` - -DeepSpeed can be used with almost the same command as for a multi-card run: -- `use_mpi` should be replaced by `use_deepspeed`, -- an additional `--deepspeed path_to_my_deepspeed config` argument should be provided, for instance `--deepspeed ../../tests/configs/deepspeed_zero_2.json`. 
- -For example: -```bash -PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_audio_classification.py \ - --model_name_or_path facebook/wav2vec2-base \ - --dataset_name common_language \ - --audio_column_name audio \ - --label_column_name language \ - --output_dir /tmp/wav2vec2-base-lang-id \ - --overwrite_output_dir \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --learning_rate 3e-4 \ - --max_length_seconds 8 \ - --attention_mask False \ - --warmup_ratio 0.1 \ - --num_train_epochs 10 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 32 \ - --seed 0 \ - --use_habana \ - --use_lazy_mode False\ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json \ - --trust_remote_code True -``` - -[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... @@ -164,6 +120,7 @@ python run_audio_classification.py \ --output_dir /tmp/wav2vec2-base-ft-keyword-spotting \ --overwrite_output_dir \ --remove_unused_columns False \ + --bf16 \ --do_eval \ --max_length_seconds 1 \ --attention_mask False \ @@ -172,11 +129,9 @@ python run_audio_classification.py \ --use_habana \ --use_lazy_mode \ --use_hpu_graphs_for_inference \ + --throughput_warmup_steps 3 \ --gaudi_config_name Habana/wav2vec2 \ - --bf16 \ - --trust_remote_code True\ - --torch_compile \ - --torch_compile_backend hpu_backend + --trust_remote_code ``` diff --git a/examples/audio-classification/requirements.txt b/examples/audio-classification/requirements.txt index 720a5a4abc..bae36f7451 100644 --- a/examples/audio-classification/requirements.txt +++ b/examples/audio-classification/requirements.txt @@ -1,3 +1,4 @@ datasets>=1.14.0 evaluate +numba==0.60.0 librosa diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index 9a23428866..c93e88def0 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -47,7 +47,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") @@ -177,6 +177,31 @@ class ModelArguments: default=False, metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, ) + use_flash_attention: bool = field( + default=False, metadata={"help": "Whether to use Habana flash attention for fine-tuning"} + ) + flash_attention_recompute: bool = field( + default=False, + metadata={ + "help": "Whether to enable recompute in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." 
+ }, + ) + flash_attention_fast_softmax: bool = field( + default=False, + metadata={ + "help": "Whether to use fast softmax for Habana flash attention." + " It is applicable only when use_flash_attention is True." + }, + ) + + def __post_init__(self): + if self.flash_attention_recompute: + assert self.use_flash_attention, "flash_attention_recompute is set, but use_flash_attention is not" + os.environ["FLASH_ATTENTION_RECOMPUTE"] = "1" + if self.flash_attention_fast_softmax: + assert self.use_flash_attention, "flash_attention_fast_softmax is set, but use_flash_attention is not" + os.environ["FLASH_ATTENTION_FAST_SOFTMAX"] = "1" def main(): @@ -364,6 +389,7 @@ def compute_metrics(eval_pred): revision=model_args.model_revision, token=model_args.token, trust_remote_code=model_args.trust_remote_code, + attn_implementation="sdpa" if model_args.use_flash_attention else "eager", ) model = AutoModelForAudioClassification.from_pretrained( model_args.model_name_or_path, diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md index 7a095bc9ca..d21eece8bf 100644 --- a/examples/contrastive-image-text/README.md +++ b/examples/contrastive-image-text/README.md @@ -115,6 +115,7 @@ PT_HPU_LAZY_MODE=0 python run_clip.py \ --gaudi_config_name Habana/clip \ --throughput_warmup_steps 3 \ --dataloader_num_workers 16 \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code \ --torch_compile_backend=hpu_backend \ @@ -127,32 +128,34 @@ PT_HPU_LAZY_MODE=0 python run_clip.py \ Run the following command for distributed training: ```bash -PT_HPU_LAZY_MODE=0 \ -python ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \ - --output_dir ./clip-roberta-finetuned \ - --model_name_or_path ./clip-roberta \ +PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 \ +python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \ + --output_dir=/tmp/clip_roberta \ + --model_name_or_path=./clip-roberta \ --data_dir $PWD/data \ --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name=2017 \ + --dataset_config_name 2017 \ --image_column image_path \ --caption_column caption \ --remove_unused_columns=False \ - --do_train --do_eval \ - --per_device_train_batch_size="512" \ + --do_train --do_eval \ + --mediapipe_dataloader \ + --per_device_train_batch_size="64" \ --per_device_eval_batch_size="64" \ --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ --overwrite_output_dir \ - --save_strategy epoch \ --use_habana \ - --gaudi_config_name Habana/clip \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 16 \ - --mediapipe_dataloader \ - --bf16 \ - --distribution_strategy fast_ddp \ - --trust_remote_code \ + --use_lazy_mode=False \ + --gaudi_config_name="Habana/clip" \ + --throughput_warmup_steps=3 \ + --save_strategy="no" \ + --dataloader_num_workers=2 \ + --use_hpu_graphs \ + --max_steps=100 \ --torch_compile_backend=hpu_backend \ - --torch_compile + --torch_compile \ + --logging_nan_inf_filter \ + --trust_remote_code ``` > `--mediapipe_dataloader` only works on Gaudi2. 
@@ -160,54 +163,8 @@ python ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \ ### DeepSpeed -Run the following command for training with DeepSpeed: - -```bash -PT_HPU_LAZY_MODE=0 \ -python ../gaudi_spawn.py --world_size 8 --use_deepspeed run_clip.py \ - --output_dir ./clip-roberta-finetuned \ - --model_name_or_path ./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name=2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_train --do_eval \ - --per_device_train_batch_size="512" \ - --per_device_eval_batch_size="64" \ - --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ - --overwrite_output_dir \ - --save_strategy epoch \ - --use_habana \ - --gaudi_config_name Habana/clip \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config \ - --trust_remote_code \ - --torch_compile_backend=hpu_backend \ - --torch_compile -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` +You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed. +You can also look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. ## BridgeTower @@ -265,6 +222,7 @@ python run_clip.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/clip \ --bf16 \ + --sdp_on_bf16 \ --mediapipe_dataloader \ --trust_remote_code ``` diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py index b54ca8e7c0..5964b2cdcc 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -59,7 +59,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index 6a8ca235e1..fc3bb4886e 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py index 0f76dcd379..f282809a31 100644 --- a/examples/gaudi_spawn.py +++ b/examples/gaudi_spawn.py @@ -84,7 +84,7 @@ def main(): if not is_deepspeed_available(): raise ImportError( "--use_deepspeed requires deepspeed: `pip install" - " git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`." + " git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0`." ) # Patch sys.argv diff --git a/examples/image-classification/README.md b/examples/image-classification/README.md index 08c4d67123..01b19b25ba 100644 --- a/examples/image-classification/README.md +++ b/examples/image-classification/README.md @@ -57,6 +57,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \ --gaudi_config_name Habana/vit \ --throughput_warmup_steps 6 \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` @@ -107,6 +108,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \ --gaudi_config_name Habana/vit \ --throughput_warmup_steps 3 \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` @@ -211,6 +213,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ --gaudi_config_name Habana/vit \ --throughput_warmup_steps 8 \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` @@ -298,6 +301,7 @@ python run_image_classification.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/vit \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index b2694665a3..bc45087f9e 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -64,7 +64,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 51f4a5dda2..7a8ad04664 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -17,102 +17,12 @@ limitations under the License. # Image to Text Examples This directory contains a script that showcases how to perform image to text generation on IntelĀ® GaudiĀ® AI Accelerators. -## Single-HPU inference +Habana FusedSDPA is a fused and optimized implementation of torch.nn.functional.scaled_dot_product_attention() for Gaudi. For more details, refer to [Gaudi online documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa). We optimized many models with FusedSDPA implementation as in [optimum/habana/transformers/models](https://github.com/huggingface/optimum-habana/tree/main/optimum/habana/transformers/models). If a model is not optimized with FusedSDPA, it uses [SDPA implementation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html). 
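For a model covered by those FusedSDPA-optimized implementations, enabling the fused path at run time only requires the flash-attention flags already used throughout this README. As an illustrative single-card sketch (the same shape of command appears in the Llava sections this diff reworks):

```bash
# Illustrative sketch: mirrors the Llava FusedSDPA commands used elsewhere in this README.
python3 run_pipeline.py \
    --model_name_or_path llava-hf/llava-1.5-7b-hf \
    --image_path "https://llava-vl.github.io/static/images/view.jpg" \
    --use_hpu_graphs \
    --bf16 \
    --use_flash_attention \
    --flash_attention_recompute
```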
-Models that have been validated: - - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) - - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) - - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) - - [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) - - [llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf) - - [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) - - [llava-hf/llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) - - [llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) - - [llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) - - [llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf) - - [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) - - [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) - - [meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) - - [tiiuae/falcon-11B-vlm](https://huggingface.co/tiiuae/falcon-11B-vlm) - - [google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224) +## Inference with mixed-precision (BF16) -### Inference with BF16 - -To run Salesforce/blip-image-captioning-large inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path Salesforce/blip-image-captioning-large \ - --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ - --use_hpu_graphs \ - --bf16 -``` - -To run Llava-1.5-7b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ - --use_hpu_graphs \ - --bf16 -``` - -To run Llava-1.5-13b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-13b-hf \ - --use_hpu_graphs \ - --bf16 -``` - -To run Llava-v1.6-mistral-7b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --use_hpu_graphs \ - --bf16 -``` - -To run Llava-v1.6-vicuna-13b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ - --use_hpu_graphs \ - --bf16 -``` - -To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-34b-hf \ - --use_hpu_graphs \ - --bf16 -``` - -To run google/paligemma-3b-mix-224 inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path google/paligemma-3b-mix-224 \ - --use_hpu_graphs \ - --bf16 -``` - -To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llama3-llava-next-8b-hf \ - --use_hpu_graphs \ - --bf16 -``` - -To run idefics2 inference, use the following command: - -```bash -python3 run_pipeline.py \ - --model_name_or_path HuggingFaceM4/idefics2-8b \ - --use_hpu_graphs \ - --bf16 -``` - -To run mllama inference using reduced precision in the SDPA, use the following command: +### Single card inference with BF16 +To run Llama inference with SDPA, use 
the following command: ```bash python3 run_pipeline.py \ @@ -121,87 +31,53 @@ python3 run_pipeline.py \ --bf16 \ --sdp_on_bf16 ``` +> SDPA may introduce [reduced precison](https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-reduction-for-fp16-and-bf16-in-scaled-dot-product-attention-sdpa) -### Inference with FP8 -Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. -More information on enabling FP8 in SynapseAI is available here: -https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html +### Multi-cards inference with BF16 -Here is an example to measure the tensor quantization statistics on Llava-1.5-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 -``` - -Here is an example to quantize the model based on previous measurements for Llava-1.5-7b: +Use the following commands to run Llama-3.2-90B-Vision-Instruct BF16 inference with FusedSDPA on 8 HPUs: ```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ +PT_HPU_ENABLE_LAZY_COLLECTIVES=true python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ + --model_name_or_path meta-llama/Llama-3.2-90B-Vision-Instruct \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --use_flash_attention \ + --flash_attention_recompute ``` +## Inference with FP8 -Here is an example to measure the tensor quantization statistics on Llava-v1.6-mistral-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 -``` - -Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 -``` +Inference with FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/index.html?highlight=inc), which provides model measurement and quantization capabilities in PyTorch. 
+More information on enabling FP8 in SynapseAI is available here: +[Run Inference Using FP8](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=fp8) -Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b: +### Single card inference with FP8 +Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b with SDPA: ```bash QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` -Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b: +Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b with SDPA: ```bash QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 -``` - -### Inference with FusedSDPA - -Habana FusedSDPA is a fused and optimized implementation of torch.nn.functional.scaled_dot_product_attention() for Gaudi. For more details, refer to [Gaudi online documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa). - -Use the following command to run Llava-1.5-7b BF16 inference with FusedSDPA -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ --bf16 \ - --use_flash_attention \ - --flash_attention_recompute + --sdp_on_bf16 ``` - -Use the following command to run Llava-v1.6-mistral-7b BF16 inference with FusedSDPA +### Multi-cards inference with FP8 +Here is an example of measuring the tensor quantization statistics on Llava-v1.6-mistral-7b with FusedSDPA on 8 HPUs: ```bash -python3 run_pipeline.py \ +QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ @@ -210,12 +86,9 @@ python3 run_pipeline.py \ --flash_attention_recompute ``` - -Use the following commands to run Llava-v1.6-mistral-7b FP8 inference with FusedSDPA - -Here is an example of measuring the tensor quantization statistics on Llava-v1.6-mistral-7b: +Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b with FusedSDPA on 8 HPUs: ```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ +QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ @@ -224,88 +97,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --flash_attention_recompute ``` -Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python 
run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` ## LORA Finetune -To run LoRA finetuning, you can use `run_image2text_lora_finetune.py`. -Here are single-/multi-device command examples for HuggingFaceM4/idefics2-8b. - -```bash -python3 run_image2text_lora_finetune.py \ - --model_name_or_path HuggingFaceM4/idefics2-8b \ - --dataset_name nielsr/docvqa_1200_examples \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --gradient_accumulation_steps 8 \ - --weight_decay 0.01 \ - --logging_steps 25 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 5e-5 \ - --warmup_steps 50 \ - --lr_scheduler_type "constant" \ - --input_column_names 'image' 'query' \ - --output_column_names 'answers' \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --lora_rank=8 \ - --lora_alpha=8 \ - --lora_dropout=0.1 \ - --max_seq_length=512 \ - --use_hpu_graphs_for_inference \ - --low_cpu_mem_usage True \ - --lora_target_modules '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' -``` - -```bash -python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_image2text_lora_finetune.py \ - --model_name_or_path HuggingFaceM4/idefics2-8b \ - --dataset_name nielsr/docvqa_1200_examples \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --gradient_accumulation_steps 8 \ - --weight_decay 0.01 \ - --logging_steps 25 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 5e-5 \ - --warmup_steps 50 \ - --lr_scheduler_type "constant" \ - --input_column_names 'image' 'query' \ - --output_column_names 'answers' \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --lora_rank=8 \ - --lora_alpha=8 \ - --lora_dropout=0.1 \ - --max_seq_length=512 \ - --use_hpu_graphs_for_inference \ - --low_cpu_mem_usage True \ - --lora_target_modules '".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"' -``` - Here are single-/multi-device command examples for meta-llama/Llama-3.2-11B-Vision-Instruct. 
```bash @@ -375,54 +168,6 @@ python3 ../gaudi_spawn.py \ --lora_target_modules '".*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"' ``` -## Multi-HPU inference - -### BF16 Inference with FusedSDPA on 8 HPUs - -Use the following commands to run Llava-v1.6-mistral-7b BF16 inference with FusedSDPA on 8 HPUs: -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - -Use the following commands to run Llama-3.2-90B-Vision-Instruct BF16 inference with FusedSDPA on 8 HPUs: -```bash -PT_HPU_ENABLE_LAZY_COLLECTIVES=true python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path meta-llama/Llama-3.2-90B-Vision-Instruct \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - - -### FP8 Inference with FusedSDPA on 8 HPUs - -Use the following commands to run Llava-v1.6-mistral-7b FP8 inference with FusedSDPA on 8 HPUs. -Here is an example of measuring the tensor quantization statistics on Llava-v1.6-mistral-7b on 8 HPUs: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - -Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b on 8 HPUs: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` +> For different models, please adjust training parameters and `lora_target_modules`. Such as replace `lora_target_modules` +> with below for HuggingFaceM4/idefics2-8b. +> '".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"' diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 44eb8d575a..de849e3469 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -25,6 +25,10 @@ import torch from transformers import AutoConfig, AutoModelForVision2Seq, AutoProcessor, pipeline +from optimum.habana.utils import ( + set_seed, +) + logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -179,6 +183,23 @@ def main(): action="store_true", help="Allow PyTorch to use reduced precision in the SDPA math backend", ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=None, + help="If > 0 then pad the input sequences to this specified length of tokens. will not apply truncate to avoid deleting the image tag", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", + ) args = parser.parse_args() @@ -192,14 +213,18 @@ def main(): os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") if args.world_size > 0: os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + os.environ.setdefault("DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1") from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi adapt_transformers_to_gaudi() + set_seed(args.seed) + config = AutoConfig.from_pretrained(args.model_name_or_path) model_type = config.model_type - if args.image_path is None and model_type in ["llava", "idefics2", "mllama"]: + + if args.image_path is None and model_type in ["llava", "idefics2", "mllama", "qwen2_vl"]: args.image_path = ["https://llava-vl.github.io/static/images/view.jpg"] elif args.image_path is None and model_type == "paligemma": args.image_path = [ @@ -210,8 +235,8 @@ def main(): "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" ] - if model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma"]: - processor = AutoProcessor.from_pretrained(args.model_name_or_path) + if model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma", "qwen2_vl"]: + processor = AutoProcessor.from_pretrained(args.model_name_or_path, padding_side="left") if args.prompt is None: if processor.chat_template is not None: conversation = [ @@ -289,6 +314,9 @@ def main(): generator = pipeline( "image-to-text", model=args.model_name_or_path, + config=args.model_name_or_path, + tokenizer=args.model_name_or_path, + image_processor=args.model_name_or_path, torch_dtype=model_dtype, device="hpu", ) @@ -308,6 +336,7 @@ def main(): "use_flash_attention": args.use_flash_attention, "flash_attention_recompute": args.flash_attention_recompute, "limit_hpu_graphs": args.limit_hpu_graphs, + "do_sample": args.do_sample, } if args.sdp_on_bf16: @@ -316,17 +345,27 @@ def main(): if args.use_kv_cache: generate_kwargs["use_cache"] = args.use_kv_cache + if model_type == "qwen2_vl": + generate_kwargs["use_cache"] = True + generate_kwargs["cache_implementation"] = "static" + if args.quant_config: generator.model = setup_quantization(generator.model, args) htcore.hpu_initialize(generator.model) # delete once pipeline integrate AutoProcessor as preprocess engine - if model_type in ["idefics2", "mllama", "paligemma"]: + # could use "image-text-to-text" pipeline in transformers 4.47 + + if model_type in ["idefics2", "mllama", "paligemma", "qwen2_vl", "llava", "llava_next"]: from transformers.image_utils import load_image def preprocess(self, image, prompt=None, timeout=None): + kwargs = {} + if args.max_input_tokens is not None and args.max_input_tokens > 0: + kwargs["max_length"] = args.max_input_tokens + kwargs["padding"] = "max_length" image = load_image(image, timeout=timeout) - model_inputs = processor(images=image, text=prompt, return_tensors=self.framework) + model_inputs = processor(images=image, text=prompt, return_tensors=self.framework, **kwargs) return model_inputs generator.__class__.preprocess = preprocess @@ -355,7 +394,7 @@ def preprocess(self, image, prompt=None, timeout=None): throughput = total_new_tokens_generated / duration logger.info(f"result = {result}") logger.info( - f"time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second" + f"time = {(end - start) * 1000 / args.n_iterations}ms, Throughput (including tokenization) = 
{throughput} tokens/second" ) # Store results if necessary diff --git a/examples/kubernetes/Chart.yaml b/examples/kubernetes/Chart.yaml index dc0400ccb0..d1c1778076 100644 --- a/examples/kubernetes/Chart.yaml +++ b/examples/kubernetes/Chart.yaml @@ -3,7 +3,7 @@ name: optimum-habana-example-chart description: This Helm chart deploys example jobs using Optimum for IntelĀ® GaudiĀ® Accelerators to a Kubernetes cluster. # Compatible Kubernetes versions -kubeVersion: 1.27-1.29 +kubeVersion: 1.27 - 1.29 # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. diff --git a/examples/kubernetes/Dockerfile b/examples/kubernetes/Dockerfile index 08f2937fca..7ebfd93894 100644 --- a/examples/kubernetes/Dockerfile +++ b/examples/kubernetes/Dockerfile @@ -1,7 +1,7 @@ -ARG GAUDI_SW_VER=1.18.0 +ARG GAUDI_SW_VER=1.19.0 ARG OS=ubuntu22.04 -ARG TORCH_VER=2.4.0 -ARG OPTIMUM_HABANA_VER=1.14.0 +ARG TORCH_VER=2.5.1 +ARG OPTIMUM_HABANA_VER=1.15.0 FROM vault.habana.ai/gaudi-docker/${GAUDI_SW_VER}/${OS}/habanalabs/pytorch-installer-${TORCH_VER}:latest AS optimum-habana diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md index 2ba6b017f1..06f4f01d09 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -43,12 +43,12 @@ Use the the following commands to build the containers: ```bash # Specify the Gaudi SW version, OS, and PyTorch version which will be used for the base container -export GAUDI_SW_VER=1.18.0 +export GAUDI_SW_VER=1.19.0 export OS=ubuntu22.04 -export TORCH_VER=2.4.0 +export TORCH_VER=2.5.1 # Specify the version of optimum-habana to install in the container -export OPTIMUM_HABANA_VER=1.14.0 +export OPTIMUM_HABANA_VER=1.15.0 git clone https://github.com/huggingface/optimum-habana.git diff --git a/examples/kubernetes/README.md.gotmpl b/examples/kubernetes/README.md.gotmpl index 52a2c4fbab..431f8ad611 100644 --- a/examples/kubernetes/README.md.gotmpl +++ b/examples/kubernetes/README.md.gotmpl @@ -43,12 +43,12 @@ Use the the following commands to build the containers: ```bash # Specify the Gaudi SW version, OS, and PyTorch version which will be used for the base container -export GAUDI_SW_VER=1.18.0 +export GAUDI_SW_VER=1.19.0 export OS=ubuntu22.04 -export TORCH_VER=2.4.0 +export TORCH_VER=2.5.1 # Specify the version of optimum-habana to install in the container -export OPTIMUM_HABANA_VER=1.14.0 +export OPTIMUM_HABANA_VER=1.15.0 git clone https://github.com/huggingface/optimum-habana.git diff --git a/examples/kubernetes/docker-compose.yaml b/examples/kubernetes/docker-compose.yaml index 214707eccb..6bdea75bbd 100644 --- a/examples/kubernetes/docker-compose.yaml +++ b/examples/kubernetes/docker-compose.yaml @@ -5,30 +5,30 @@ services: http_proxy: ${http_proxy:-""} https_proxy: ${https_proxy:-""} no_proxy: ${no_proxy:-""} - GAUDI_SW_VER: ${GAUDI_SW_VER:-1.18.0} + GAUDI_SW_VER: ${GAUDI_SW_VER:-1.19.0} OS: ${OS:-ubuntu22.04} - OPTIMUM_HABANA_VER: ${OPTIMUM_HABANA_VER:-1.14.0} - TORCH_VER: ${TORCH_VER:-2.4.0} + OPTIMUM_HABANA_VER: ${OPTIMUM_HABANA_VER:-1.15.0} + TORCH_VER: ${TORCH_VER:-2.5.1} REGISTRY: ${REGISTRY} REPO: ${REPO} context: . 
labels: - org.opencontainers.base.name: "vault.habana.ai/gaudi-docker/${GAUDI_SW_VER:-1.18.0}/${OS:-ubuntu22.04}/habanalabs/pytorch-installer-${TORCH_VER:-2.3.1}:latest" + org.opencontainers.base.name: "vault.habana.ai/gaudi-docker/${GAUDI_SW_VER:-1.19.0}/${OS:-ubuntu22.04}/habanalabs/pytorch-installer-${TORCH_VER:-2.5.1}:latest" org.opencontainers.image.title: "Optimum for IntelĀ® GaudiĀ® Accelerators" - org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.13.0} + org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0} command: > sh -c "python -c 'from optimum import habana; print(\"optimum-habana:\", habana.__version__)'" - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.13.0} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0} pull_policy: always optimum-habana-examples: build: labels: - org.opencontainers.base.name: "${REGISTRY}/${REPO}:gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.13.0}" + org.opencontainers.base.name: "${REGISTRY}/${REPO}:gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0}" org.opencontainers.image.title: "Optimum for IntelĀ® GaudiĀ® Accelerators Examples" - org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.13.0} + org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.15.0} target: optimum-habana-examples command: > sh -c "python -c 'from optimum import habana; print(\"optimum-habana:\", habana.__version__)'" extends: optimum-habana - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.18.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.13.0} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.15.0} diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index 9ef27f9e73..5cce1528dc 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -131,60 +131,6 @@ python ../gaudi_spawn.py \ This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json -### Multi-card Training with Deepspeed (chatglm3-6b) -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --config_name THUDM/chatglm3-6b \ - --tokenizer_name THUDM/chatglm3-6b \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 6 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --deepspeed llama2_ds_zero3_config.json \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --bf16 \ - --block_size 1024 \ - --use_cache False \ - --overwrite_output_dir \ - --logging_first_step True \ - --logging_steps 20 -``` - -### Multi-card Training with Deepspeed (Baichuan2-13B-Chat) -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --config_name baichuan-inc/Baichuan2-13B-Chat \ - --tokenizer_name baichuan-inc/Baichuan2-13B-Chat \ - --dataset_name wikitext \ - --num_train_epochs 30 \ - --dataset_config_name wikitext-2-raw-v1 \ - 
--per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --do_train \ - --do_eval \ - --deepspeed llama2_ds_zero3_config.json \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --bf16 \ - --block_size 1024 \ - --use_cache False \ - --overwrite_output_dir \ - --logging_first_step True \ - --logging_steps 20 -``` - - ## Multi-Node Training with Deepspeed (GPT-NeoX) The following command triggers the fine-tuning of [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) on WikiText-2 with Deepspeed ZeRO-2. @@ -226,10 +172,11 @@ Following the RoBERTa paper, we use dynamic masking rather than static masking. converge slightly slower (over-fitting takes more epochs). -### Single-card Training +### Multi-card Training ```bash -python run_mlm.py \ +python ../gaudi_spawn.py \ + --world_size 8 --use_mpi run_mlm.py \ --model_name_or_path roberta-base \ --dataset_name wikitext \ --dataset_config_name wikitext-2-raw-v1 \ @@ -246,54 +193,12 @@ python run_mlm.py \ --bf16 ``` -To run on your own training and validation files, use the following command: - -```bash -python run_mlm.py \ - --model_name_or_path roberta-base \ - --train_file path_to_train_file \ - --validation_file path_to_validation_file \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script concatenates all texts and then splits them into blocks of the same length). **Note:** On HPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make sure all your batches have the same length. -### Multi-card Training - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_mlm.py \ - --model_name_or_path roberta-base \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - ### Training in torch.compile mode RoBERTa-Large model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. @@ -324,78 +229,6 @@ python run_clm.py \ --bf16 ``` - -## Using DeepSpeed - -Multi-card examples can be simply adapted to be run with DeepSpeed. 
Here is the CLM example with GPT2-XL: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path gpt2-xl \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --learning_rate 4e-4 \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - -Here is another example with Bloom-7B1: - -```bash -DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 PT_HPU_MAX_COMPOUND_OP_SYNC=1 PT_HPU_MAX_COMPOUND_OP_SIZE=1 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path bigscience/bloom-7b1 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --do_train \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/roberta-base \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --save_strategy "no" \ - --learning_rate 1e-04 \ - --deepspeed path_to_my_deepspeed_config -``` -[This](https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_3_gaudi1.json) is a DeepSpeed configuration you can use to train this model on Gaudi1. - - ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... 
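As an illustration of that pruning, an evaluation-only variant of the GPT-2 causal language modeling command could look like the sketch below, assembled from flags shown in this README rather than taken verbatim from the diff:

```bash
# Illustrative sketch: the CLM example with the training-only arguments removed.
python run_clm.py \
    --model_name_or_path gpt2 \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --per_device_eval_batch_size 4 \
    --do_eval \
    --output_dir /tmp/test-clm \
    --gaudi_config_name Habana/gpt2 \
    --use_habana \
    --use_lazy_mode \
    --use_hpu_graphs_for_inference \
    --bf16
```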
@@ -456,141 +289,6 @@ python3 run_lora_clm.py \ --validation_split_percentage 4 \ --adam_epsilon 1e-08 ``` -- Single-card finetuning of Falcon-40B: -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-40b \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --low_cpu_mem_usage True \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 -``` - -- Multi-card finetuning of Llama1-7B: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path huggyllama/llama-7b \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 1 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 4 \ - --low_cpu_mem_usage True -``` - -- Multi-card finetuning of Llama2-7B with FP8: -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 3 \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 1 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 20 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 18 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 10 \ - --low_cpu_mem_usage True \ - --pipelining_fwd_bwd \ - --fp8 True -``` - -- Multi-card finetuning of codegen-16B-mono: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path Salesforce/codegen-16B-mono \ - --dataset_name b-mc2/sql-create-context \ - --sql_prompt \ - --bf16 True \ - --output_dir ./finetuned-models/codegen-finetune-on-sql-create-context-hpu8-lora8-bs4 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --eval_strategy "no" \ - --save_strategy "no" \ - 
--learning_rate 1e-4 \ - --logging_steps 1 \ - --dataset_concatenation \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_inference \ - --lora_target_modules "qkv_proj" \ - --lora_rank 8 \ - --do_eval \ - --validation_split_percentage 10 \ - --use_cache False -``` - Multi-card finetuning of gemma2 using chat template: ```bash @@ -740,43 +438,6 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ --flash_attention_causal_mask True ``` -- Multi-card finetuning of Falcon-180B: - - Falcon-180B example command saves only the LoRA parameters at end - - For inference we need to merge the pretrained model and LoRA weights -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-180B \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 4e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 \ - --deepspeed ds_falcon_180b_z3.json -``` Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`, or enable ln-tuning using `--peft_type ln_tuning`, or enable vera using `--peft_type vera`. #### Custom Files @@ -824,7 +485,7 @@ The format of the text files (with extensions .text or .txt) is expected to be ### Prompt/Prefix/P-tuning To run prompt tuning finetuning, you can use `run_prompt_tuning_clm.py`. -Here are single-/multi-device command examples for Llama2-7B: +Here are single-card command examples for Llama2-7B: - single-card finetuning of meta-llama/Llama-2-7b-hf with dataset "ought/raft" and config "twitter_complaints": ```bash python3 run_prompt_tuning_clm.py \ @@ -844,25 +505,6 @@ python3 run_prompt_tuning_clm.py \ --use_lazy_mode ``` -- multi-card finetuning of meta-llama/Llama-2-7b-hf with dataset "ought/raft" and config "twitter_complaints": -```bash -python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_prompt_tuning_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --output_dir prompt_tuning_out \ - --bf16 True \ - --report_to=none \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 1 \ - --low_cpu_mem_usage True \ - --logging_steps 1 \ - --do_train \ - --num_train_epochs 50 \ - --do_eval \ - --use_habana \ - --use_lazy_mode -``` Default `peft_type` is `prompt_tuning`, you could enable prefix-tuning or p-tuning using `--peft_type prefix_tuning` or `--peft_type p_tuning`. 
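For example, a minimal sketch of switching the Llama2-7B prompt-tuning command above to prefix-tuning (the output directory name below is just a placeholder; the other argument values mirror the command shown earlier) could be:

```bash
python3 run_prompt_tuning_clm.py \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --peft_type prefix_tuning \
    --output_dir prefix_tuning_out \
    --bf16 True \
    --report_to=none \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --low_cpu_mem_usage True \
    --logging_steps 1 \
    --do_train \
    --num_train_epochs 50 \
    --do_eval \
    --use_habana \
    --use_lazy_mode
```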
Use the prompt finetuned model for text-generation: diff --git a/examples/language-modeling/llama3_ds_zero1_config.json b/examples/language-modeling/llama3_ds_zero1_config.json new file mode 100755 index 0000000000..b04ef0f0a9 --- /dev/null +++ b/examples/language-modeling/llama3_ds_zero1_config.json @@ -0,0 +1,13 @@ +{ + "steps_per_print": 64, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "bf16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 1 + } +} diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index b97b634941..93d85ba54b 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -156,6 +156,32 @@ class ModelArguments: ) }, ) + attn_softmax_bf16: bool = field( + default=False, + metadata={"help": ("Whether to run attention softmax layer in bf16 precision for fine-tuning.")}, + ) + use_flash_attention: bool = field( + default=False, + metadata={"help": ("Whether to use Habana flash attention for fine-tuning.")}, + ) + flash_attention_recompute: bool = field( + default=False, + metadata={ + "help": ( + "Whether to enable recompute in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." + ) + }, + ) + flash_attention_causal_mask: bool = field( + default=False, + metadata={ + "help": ( + "Whether to enable causal mask in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." + ) + }, + ) low_cpu_mem_usage: bool = field( default=False, metadata={ @@ -472,7 +498,7 @@ def main(): else: model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params") # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -482,6 +508,14 @@ def main(): if len(tokenizer) > embedding_size: model.resize_token_embeddings(len(tokenizer)) + # We need to add these fused kernels config + if model_args.attn_softmax_bf16: + model.generation_config.attn_softmax_bf16 = True + if model_args.use_flash_attention: + model.generation_config.use_flash_attention = True + model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute + model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask + # Preprocessing the datasets. # First we tokenize all the texts. 
if training_args.do_train: diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index 4782ed58ae..3ff7fbfd3a 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -70,7 +70,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") @dataclass diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 30315bfc84..2de43c910b 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_multitask_prompt_tuning.py b/examples/language-modeling/run_multitask_prompt_tuning.py index 1d81bcc496..9f955db44e 100644 --- a/examples/language-modeling/run_multitask_prompt_tuning.py +++ b/examples/language-modeling/run_multitask_prompt_tuning.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risk. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py index e263c0c1b6..44ea542d14 100644 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ b/examples/language-modeling/run_prompt_tuning_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/multi-node-training/EFA/Dockerfile b/examples/multi-node-training/EFA/Dockerfile index a527f99603..bc6f827164 100644 --- a/examples/multi-node-training/EFA/Dockerfile +++ b/examples/multi-node-training/EFA/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest # Installs pdsh and upgrade pip RUN apt-get update && apt-get install -y pdsh && \ @@ -14,11 +14,12 @@ RUN git clone "https://github.com/HabanaAI/hccl_ofi_wrapper.git" "${OFI_WRAPPER_ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config && \ sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + /usr/bin/ssh-keygen -A && \ service ssh restart # Installs Optimum Habana and Habana's fork of DeepSpeed RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ chmod 600 ~/.ssh/id_rsa && \ diff --git a/examples/multi-node-training/GaudiNIC/Dockerfile b/examples/multi-node-training/GaudiNIC/Dockerfile index b3763c4277..5375a6fcc7 100644 --- a/examples/multi-node-training/GaudiNIC/Dockerfile +++ b/examples/multi-node-training/GaudiNIC/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest # Installs pdsh and upgrade pip RUN apt-get update && apt-get install -y pdsh && \ @@ -8,11 +8,12 @@ RUN apt-get update && apt-get install -y pdsh && \ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config && \ sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + /usr/bin/ssh-keygen -A && \ service ssh restart # Installs Optimum Habana and Habana's fork of DeepSpeed RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ chmod 600 ~/.ssh/id_rsa && \ diff --git a/examples/object-detection/README.md b/examples/object-detection/README.md index aa82013326..0ce639dc9b 100644 --- a/examples/object-detection/README.md +++ b/examples/object-detection/README.md @@ -28,7 +28,3 @@ python3 run_example.py \ --bf16 \ --print_result ``` - -Models that have been validated: - - [facebook/detr-resnet-101](https://huggingface.co/facebook/detr-resnet-101) - - [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) \ No newline at end of file diff --git a/examples/object-segementation/README.md b/examples/object-segementation/README.md index 936180e4f2..2b8728eb56 100644 --- a/examples/object-segementation/README.md +++ b/examples/object-segementation/README.md @@ -30,8 +30,6 @@ python3 run_example.py \ --bf16 \ --print_result ``` -Models that have been validated: - - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) 
### Segment Anything Model @@ -45,7 +43,4 @@ python3 run_example_sam.py \ --use_hpu_graphs \ --bf16 \ --print_result -``` -Models that have been validated: - - [facebook/sam-vit-base](https://huggingface.co/facebook/sam-vit-base) - - [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) \ No newline at end of file +``` \ No newline at end of file diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py index 6941e6e5c1..230d1c61e8 100644 --- a/examples/protein-folding/run_esmfold.py +++ b/examples/protein-folding/run_esmfold.py @@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") def convert_outputs_to_pdb(outputs): diff --git a/examples/protein-folding/run_sequence_classification.py b/examples/protein-folding/run_sequence_classification.py index dde75a2564..fa35d8b803 100644 --- a/examples/protein-folding/run_sequence_classification.py +++ b/examples/protein-folding/run_sequence_classification.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/examples/protein-folding/run_zero_shot_eval.py b/examples/protein-folding/run_zero_shot_eval.py index 3b475883e8..7da135f080 100644 --- a/examples/protein-folding/run_zero_shot_eval.py +++ b/examples/protein-folding/run_zero_shot_eval.py @@ -36,7 +36,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logging.basicConfig( diff --git a/examples/pytorch-image-models/README.md b/examples/pytorch-image-models/README.md index 8567a77fe6..731e61d612 100644 --- a/examples/pytorch-image-models/README.md +++ b/examples/pytorch-image-models/README.md @@ -16,20 +16,7 @@ limitations under the License. # pyTorch-IMage-Models (TIMM) Examples with HPUs -This directory contains the scripts that showcases how to inference/fine-tune the TIMM models on intel's HPUs with the lazy/graph modes. We support the trainging for single/multiple HPU cards both two. Currently we support several most downloadable models from Hugging Face as below list. 
- -- [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) -- [timm/resnet18.a1_in1k](https://huggingface.co/timm/resnet18.a1_in1k) -- [timm/resnet18.fb_swsl_ig1b_ft_in1k](https://huggingface.co/timm/resnet18.fb_swsl_ig1b_ft_in1k) -- [timm/wide_resnet50_2.racm_in1k](https://huggingface.co/timm/wide_resnet50_2.racm_in1k) -- [timm/efficientnet_b3.ra2_in1k](https://huggingface.co/timm/efficientnet_b3.ra2_in1k) -- [timm/efficientnet_lite0.ra_in1k](https://huggingface.co/timm/efficientnet_lite0.ra_in1k) -- [timm/efficientnet_b0.ra_in1k](https://huggingface.co/timm/efficientnet_b0.ra_in1k) -- [timm/nf_regnet_b1.ra2_in1k](https://huggingface.co/timm/nf_regnet_b1.ra2_in1k) -- [timm/mobilenetv3_large_100.ra_in1k](https://huggingface.co/timm/mobilenetv3_large_100.ra_in1k) -- [timm/tf_mobilenetv3_large_minimal_100.in1k](https://huggingface.co/timm/tf_mobilenetv3_large_minimal_100.in1k) -- [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k](https://huggingface.co/timm/vit_base_patch16_224.augreg2_in21k_ft_in1k) -- [timm/vgg19.tv_in1k](https://huggingface.co/timm/vgg19.tv_in1k) +This directory contains scripts that showcase how to run inference and fine-tuning with TIMM models on Intel HPUs in lazy and graph modes. Training is supported on both single and multiple HPU cards. Currently, the 10 most downloaded models from [Hugging Face timm](https://huggingface.co/timm) are supported. The examples below use [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) as the test model; the same usage applies to other models. ## Requirements @@ -46,72 +33,30 @@ pip install . Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.co/datasets/johnowhitaker/imagenette2-320) and model with [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) from Hugging Face.
-### Training with HPU lazy mode - -```bash -python train_hpu_lazy.py \ - --data-dir ./ \ - --dataset hfds/johnowhitaker/imagenette2-320 \ - --device 'hpu' \ - --model resnet50.a1_in1k \ - --train-split train \ - --val-split train \ - --dataset-download -``` - -python train_hpu_lazy.py --data-dir='./' --dataset hfds/johnowhitaker/imagenette2-320 --device='hpu' --model resnet50.a1_in1k ### Training with HPU graph mode ```bash python train_hpu_graph.py \ --data-dir ./ \ - --dataset hfds/johnowhitaker/imagenette2-320 \ + --dataset hfds/johnowhitaker/imagenette2-320 \ --device 'hpu' \ --model resnet50.a1_in1k \ --train-split train \ --val-split train \ - --dataset-download + --dataset-download ``` -Here the results for lazy mode is shown below for example: - -```bash -Train: 0 [ 0/73 ( 1%)] Loss: 6.86 (6.86) Time: 9.575s, 13.37/s (9.575s, 13.37/s) LR: 1.000e-05 Data: 0.844 (0.844) -Train: 0 [ 50/73 ( 70%)] Loss: 6.77 (6.83) Time: 0.320s, 400.32/s (0.470s, 272.39/s) LR: 1.000e-05 Data: 0.217 (0.047) -Test: [ 0/30] Time: 6.593 (6.593) Loss: 6.723 ( 6.723) Acc@1: 0.000 ( 0.000) Acc@5: 0.000 ( 0.000) -Test: [ 30/30] Time: 3.856 (0.732) Loss: 6.615 ( 6.691) Acc@1: 0.000 ( 0.076) Acc@5: 1.176 ( 3.287) - -Train: 1 [ 0/73 ( 1%)] Loss: 6.69 (6.69) Time: 0.796s, 160.74/s (0.796s, 160.74/s) LR: 1.001e-02 Data: 0.685 (0.685) -Train: 1 [ 50/73 ( 70%)] Loss: 3.23 (3.76) Time: 0.160s, 798.85/s (0.148s, 863.22/s) LR: 1.001e-02 Data: 0.053 (0.051) -Test: [ 0/30] Time: 0.663 (0.663) Loss: 1.926 ( 1.926) Acc@1: 46.094 ( 46.094) Acc@5: 85.938 ( 85.938) -Test: [ 30/30] Time: 0.022 (0.126) Loss: 1.462 ( 1.867) Acc@1: 63.529 ( 39.261) Acc@5: 83.529 ( 85.096) - -``` - - ## Multi-HPU training Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.co/datasets/johnowhitaker/imagenette2-320) and model with [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) from Hugging Face. -### Training with HPU lazy mode -```bash -torchrun --nnodes 1 --nproc_per_node 2 \ - train_hpu_lazy.py \ - --data-dir ./ \ - --dataset hfds/johnowhitaker/imagenette2-320 \ - --device 'hpu' \ - --model resnet50.a1_in1k \ - --train-split train \ - --val-split train \ - --dataset-download -``` ### Training with HPU graph mode ```bash torchrun --nnodes 1 --nproc_per_node 2 \ train_hpu_graph.py \ --data-dir ./ \ - --dataset hfds/johnowhitaker/imagenette2-320 \ + --dataset hfds/johnowhitaker/imagenette2-320 \ --device 'hpu' \ --model resnet50.a1_in1k \ --train-split train \ @@ -119,20 +64,6 @@ torchrun --nnodes 1 --nproc_per_node 2 \ --dataset-download ``` -Here the results for lazy mode is shown below for example: - -```bash -Train: 0 [ 0/36 ( 3%)] Loss: 6.88 (6.88) Time: 10.036s, 25.51/s (10.036s, 25.51/s) LR: 1.000e-05 Data: 0.762 (0.762) -Test: [ 0/15] Time: 7.796 (7.796) Loss: 6.915 ( 6.915) Acc@1: 0.000 ( 0.000) Acc@5: 0.000 ( 0.000) -Test: [ 15/15] Time: 1.915 (1.263) Loss: 6.847 ( 6.818) Acc@1: 0.000 ( 0.000) Acc@5: 0.000 ( 0.688) - -Train: 1 [ 0/36 ( 3%)] Loss: 6.84 (6.84) Time: 6.687s, 38.28/s (6.687s, 38.28/s) LR: 2.001e-02 Data: 0.701 (0.701) -Test: [ 0/15] Time: 1.315 (1.315) Loss: 2.463 ( 2.463) Acc@1: 14.062 ( 14.062) Acc@5: 48.828 ( 48.828) -Test: [ 15/15] Time: 0.020 (0.180) Loss: 1.812 ( 1.982) Acc@1: 52.326 ( 32.934) Acc@5: 66.279 ( 75.064) - -``` - - ## Single-HPU inference @@ -142,22 +73,13 @@ Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface. 
```bash python inference.py \ --data-dir='./' \ - --dataset hfds/johnowhitaker/imagenette2-320 \ + --dataset hfds/johnowhitaker/imagenette2-320 \ --device='hpu' \ --model resnet50.a1_in1k \ --split train \ --graph_mode ``` -### HPU with lazy mode -```bash -python inference.py \ - --data-dir='./' \ - --dataset hfds/johnowhitaker/imagenette2-320 \ - --device='hpu' \ - --model resnet50.a1_in1k \ - --split train -``` diff --git a/examples/pytorch-image-models/train_hpu_graph.py b/examples/pytorch-image-models/train_hpu_graph.py index 767142e469..c9d0974258 100755 --- a/examples/pytorch-image-models/train_hpu_graph.py +++ b/examples/pytorch-image-models/train_hpu_graph.py @@ -136,6 +136,12 @@ metavar="PATH", help="Load this checkpoint into model after initialization (default: none)", ) +group.add_argument( + "--save_checkpoint", + action="store_true", + default=False, + help="saving checkpoint for each epoch", +) group.add_argument( "--resume", default="", @@ -635,10 +641,6 @@ def _parse_args(): return args, args_text -def setup(): - dist.init_process_group(backend="hccl") - - def cleanup(): dist.destroy_process_group() @@ -663,8 +665,6 @@ def main(): device = torch.device("hpu") if args.distributed: - setup() - _logger.info( "Training in distributed mode with multiple processes, 1 device per process." f"Process {args.rank}, total {args.world_size}, device {args.device}." @@ -1054,17 +1054,18 @@ def main(): ] ) output_dir = utils.get_outdir(args.output if args.output else "./output/train", exp_name) - saver = utils.CheckpointSaver( - model=model, - optimizer=optimizer, - args=args, - model_ema=model_ema, - amp_scaler=loss_scaler, - checkpoint_dir=output_dir, - recovery_dir=output_dir, - decreasing=decreasing_metric, - max_history=args.checkpoint_hist, - ) + if args.save_checkpoint: + saver = utils.CheckpointSaver( + model=model, + optimizer=optimizer, + args=args, + model_ema=model_ema, + amp_scaler=loss_scaler, + checkpoint_dir=output_dir, + recovery_dir=output_dir, + decreasing=decreasing_metric, + max_history=args.checkpoint_hist, + ) with open(os.path.join(output_dir, "args.yaml"), "w") as f: f.write(args_text) @@ -1098,7 +1099,7 @@ def main(): if utils.is_primary(args): _logger.info( - f'Scheduled epochs: {num_epochs}. LR stepped per {"epoch" if lr_scheduler.t_in_epochs else "update"}.' + f"Scheduled epochs: {num_epochs}. LR stepped per {'epoch' if lr_scheduler.t_in_epochs else 'update'}." ) results = [] @@ -1330,7 +1331,7 @@ def _backward(_loss): if utils.is_primary(args): _logger.info( f"Train: {epoch} [{update_idx:>4d}/{updates_per_epoch} " - f"({100. 
* (update_idx + 1) / updates_per_epoch:>3.0f}%)] " + f"({100.0 * (update_idx + 1) / updates_per_epoch:>3.0f}%)] " f"Loss: {losses_m.val:#.3g} ({losses_m.avg:#.3g}) " f"Time: {update_time_m.val:.3f}s, {update_sample_count / update_time_m.val:>7.2f}/s " f"({update_time_m.avg:.3f}s, {update_sample_count / update_time_m.avg:>7.2f}/s) " diff --git a/examples/pytorch-image-models/train_hpu_lazy.py b/examples/pytorch-image-models/train_hpu_lazy.py index 834f9ce043..17f1dac0d9 100755 --- a/examples/pytorch-image-models/train_hpu_lazy.py +++ b/examples/pytorch-image-models/train_hpu_lazy.py @@ -138,6 +138,12 @@ metavar="PATH", help="Load this checkpoint into model after initialization (default: none)", ) +group.add_argument( + "--save_checkpoint", + action="store_true", + default=False, + help="saving checkpoint for each epoch", +) group.add_argument( "--resume", default="", @@ -637,10 +643,6 @@ def _parse_args(): return args, args_text -def setup(): - dist.init_process_group(backend="hccl") - - def cleanup(): dist.destroy_process_group() @@ -665,8 +667,6 @@ def main(): device = torch.device("hpu") if args.distributed: - setup() - _logger.info( "Training in distributed mode with multiple processes, 1 device per process." f"Process {args.rank}, total {args.world_size}, device {args.device}." @@ -1053,17 +1053,18 @@ def main(): ] ) output_dir = utils.get_outdir(args.output if args.output else "./output/train", exp_name) - saver = utils.CheckpointSaver( - model=model, - optimizer=optimizer, - args=args, - model_ema=model_ema, - amp_scaler=loss_scaler, - checkpoint_dir=output_dir, - recovery_dir=output_dir, - decreasing=decreasing_metric, - max_history=args.checkpoint_hist, - ) + if args.save_checkpoint: + saver = utils.CheckpointSaver( + model=model, + optimizer=optimizer, + args=args, + model_ema=model_ema, + amp_scaler=loss_scaler, + checkpoint_dir=output_dir, + recovery_dir=output_dir, + decreasing=decreasing_metric, + max_history=args.checkpoint_hist, + ) with open(os.path.join(output_dir, "args.yaml"), "w") as f: f.write(args_text) @@ -1097,7 +1098,7 @@ def main(): if utils.is_primary(args): _logger.info( - f'Scheduled epochs: {num_epochs}. LR stepped per {"epoch" if lr_scheduler.t_in_epochs else "update"}.' + f"Scheduled epochs: {num_epochs}. LR stepped per {'epoch' if lr_scheduler.t_in_epochs else 'update'}." ) results = [] @@ -1331,7 +1332,7 @@ def _backward(_loss): if utils.is_primary(args): _logger.info( f"Train: {epoch} [{update_idx:>4d}/{updates_per_epoch} " - f"({100. * (update_idx + 1) / updates_per_epoch:>3.0f}%)] " + f"({100.0 * (update_idx + 1) / updates_per_epoch:>3.0f}%)] " f"Loss: {losses_m.val:#.3g} ({losses_m.avg:#.3g}) " f"Time: {update_time_m.val:.3f}s, {update_sample_count / update_time_m.val:>7.2f}/s " f"({update_time_m.avg:.3f}s, {update_sample_count / update_time_m.avg:>7.2f}/s) " diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md index bf6cd04aec..d7a83ea5c8 100755 --- a/examples/question-answering/README.md +++ b/examples/question-answering/README.md @@ -33,171 +33,6 @@ First, you should install the requirements: pip install -r requirements.txt ``` -## Fine-tuning BERT on SQuAD1.1 - -For the following cases, an example of a Gaudi configuration file is given -[here](https://github.com/huggingface/optimum-habana#how-to-use-it). - - -### Single-card Training - -This example code fine-tunes BERT on the SQuAD1.1 dataset. 
- -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -For torch.compile mode, -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -### Multi-card Training - -Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -For torch.compile mode, -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - - -### Using DeepSpeed - -Similarly to multi-card training, here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using DeepSpeed with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config \ - --sdp_on_bf16 -``` - -You can look at the 
[documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - - -### Training in torch.compile mode - -Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \ -a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \ -b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags. - - ## Fine-tuning Llama on SQuAD1.1 > [!NOTE] @@ -207,7 +42,7 @@ Here is a command you can run to train a Llama model for question answering: ```bash python ../gaudi_spawn.py \ --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path FlagAlpha/Llama2-Chinese-13b-Chat \ + --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ --dataset_name squad \ --do_train \ @@ -232,76 +67,3 @@ python ../gaudi_spawn.py \ ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with BERT on SQuAD on 1 Gaudi card with the following command: -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_eval \ - --per_device_eval_batch_size 8 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --bf16 \ - --sdp_on_bf16 -``` - - -## Recommended Hyperparameters for Mixed Precision - -| | learning_rate | num_train_epochs | per_device_train_batch_size | per_device_eval_batch_size | -|----------------------------|:----:|:--:|:-:|:-:| -| BERT base | 3e-5 | 2 | 24 | 8 | -| BERT large | 3e-5 | 2 | 24 | 8 | -| RoBERTa base | 3e-5 | 2 | 12 | 8 | -| RoBERTa large | 3e-5 | 2 | 12 | 8 | -| ALBERT large (single-card) | 5e-5 | 2 | 32 | 4 | -| ALBERT large (multi-card) | 6e-5 | 2 | 32 | 4 | -| ALBERT XXL (single-card) | 5e-6 | 2 | 16 | 2 | -| ALBERT XXL (multi-card) | 5e-5 | 2 | 16 | 2 | -| DistilBERT | 5e-5 | 3 | 8 | 8 | -| meta-llama/Llama-2-13b-chat-hf (multi-card) | 3e-5 | 2 | 8 | 8 | -| FlagAlpha/Llama2-Chinese-13b-Chat (multi-card) | 3e-5 | 2 | 8 | 8 | - - -## Fine-tuning T5 on SQuAD2.0 - -The [`run_seq2seq_qa.py`](https://github.com/huggingface/optimum-habana/blob/main/examples/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer. 
- -The following command fine-tunes T5 on the SQuAD2.0 dataset: - -```bash -python run_seq2seq_qa.py \ - --model_name_or_path t5-small \ - --gaudi_config_name Habana/t5 \ - --dataset_name squad_v2 \ - --version_2_with_negative \ - --context_column context \ - --question_column question \ - --answer_column answers \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 33 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/seq2seq_squad/ \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -For multi-card and DeepSpeed runs, you can use `python ../gaudi_spawn.py --world_size 8 --use_mpi` and `python ../gaudi_spawn.py --world_size 8 --use_deepspeed` as shown in the previous sections. diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index d22949c076..5ad77be381 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index 1f045552bd..aaadbee417 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
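As a hedged illustration (not part of the original README; the model, dataset, Gaudi configuration, and output directory names below are placeholders chosen only to show where the two extra arguments go), a non-English fine-tuning run would pass `--language` and `--task` like this:

```bash
python run_speech_recognition_seq2seq.py \
    --model_name_or_path="openai/whisper-small" \
    --dataset_name="mozilla-foundation/common_voice_11_0" \
    --dataset_config_name="hi" \
    --language="hindi" \
    --task="transcribe" \
    --max_duration_in_seconds="30" \
    --text_column_name="sentence" \
    --freeze_feature_encoder="False" \
    --output_dir="/tmp/whisper-hindi" \
    --do_train \
    --do_eval \
    --overwrite_output_dir \
    --use_habana \
    --use_lazy_mode \
    --use_hpu_graphs_for_inference \
    --gaudi_config_name="Habana/whisper" \
    --label_features_max_length 128 \
    --dataloader_num_workers 8 \
    --throughput_warmup_steps 3 \
    --sdp_on_bf16 \
    --bf16
```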
check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md index 4d5eb69b91..fe80cf775f 100644 --- a/examples/speech-recognition/README.md +++ b/examples/speech-recognition/README.md @@ -85,6 +85,7 @@ python run_speech_recognition_ctc.py \ --use_lazy_mode \ --gaudi_config_name="Habana/wav2vec2" \ --throughput_warmup_steps="3" \ + --sdp_on_bf16 \ --bf16 \ --use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ @@ -128,6 +129,7 @@ python ../gaudi_spawn.py \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ --bf16 \ + --sdp_on_bf16 \ --use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ --sdp_on_bf16 @@ -143,7 +145,7 @@ On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of ** > You need to install DeepSpeed with: > ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 +> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 > ``` DeepSpeed can be used with almost the same command as for a multi-card run: @@ -210,6 +212,7 @@ python run_speech_recognition_ctc.py \ --use_habana \ --use_lazy_mode \ --gaudi_config_name="Habana/wav2vec2" \ + --sdp_on_bf16 \ --bf16 \ --use_hpu_graphs_for_inference \ --sdp_on_bf16 @@ -250,6 +253,7 @@ python run_speech_recognition_seq2seq.py \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ + --sdp_on_bf16 \ --bf16 \ --overwrite_output_dir \ --do_train \ @@ -259,7 +263,8 @@ python run_speech_recognition_seq2seq.py \ --use_hpu_graphs_for_inference \ --label_features_max_length 128 \ --dataloader_num_workers 8 \ - --throughput_warmup_steps 3 + --throughput_warmup_steps 3 \ + --sdp_on_bf16 ``` If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition. 
@@ -289,6 +294,7 @@ python ../gaudi_spawn.py \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ + --sdp_on_bf16 \ --bf16 \ --overwrite_output_dir \ --do_train \ @@ -322,6 +328,7 @@ python run_speech_recognition_seq2seq.py \ --max_duration_in_seconds="30" \ --text_column_name="sentence" \ --freeze_feature_encoder="False" \ + --sdp_on_bf16 \ --bf16 \ --overwrite_output_dir \ --do_eval \ @@ -329,5 +336,6 @@ python run_speech_recognition_seq2seq.py \ --use_habana \ --use_hpu_graphs_for_inference \ --label_features_max_length 128 \ - --dataloader_num_workers 8 + --dataloader_num_workers 8 \ + --sdp_on_bf16 ``` diff --git a/examples/speech-recognition/requirements.txt b/examples/speech-recognition/requirements.txt index 6bdf66fe01..b7c33c8ba1 100644 --- a/examples/speech-recognition/requirements.txt +++ b/examples/speech-recognition/requirements.txt @@ -1,4 +1,5 @@ -datasets >= 1.18.0 +datasets >= 1.18.0, <= 2.19.2 +numba==0.60.0 librosa jiwer evaluate diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index 83865556d1..f5da991dbf 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") @@ -504,7 +504,7 @@ def main(): # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = ( - f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None + f"[{''.join(data_args.chars_to_ignore).replace(' ', '')}]" if data_args.chars_to_ignore is not None else None ) text_column_name = data_args.text_column_name diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index ff9702e80c..db25b852eb 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -56,7 +56,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md index f4df474f09..98e818a5c6 100644 --- a/examples/stable-diffusion/README.md +++ b/examples/stable-diffusion/README.md @@ -16,10 +16,10 @@ limitations under the License. # Stable Diffusion Examples -This directory contains a script that showcases how to perform text-to-image generation using Stable Diffusion on IntelĀ® GaudiĀ® AI Accelerators. - -Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team. 
+This directory contains sample scripts demonstrating how to perform diffusion-based generative tasks on IntelĀ® GaudiĀ® AI Accelerators. +Stable Diffusion was introduced in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser, +Robin Rombach and the Stability AI team. ## Requirements @@ -28,11 +28,11 @@ First, you should install the requirements: pip install -r requirements.txt ``` -## Text-to-image Generation +## Text-to-Image Generation -### Single Prompt +### Stable Diffusion -Here is how to generate images with one prompt: +Here's how to generate images using the Stable Diffusion 1.4 model with a single prompt: ```bash python text_to_image_generation.py \ @@ -44,16 +44,16 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. -### Multiple Prompts - -Here is how to generate images with several prompts: +To generate images with multiple prompts, simply include two prompts in your input as shown below: ```bash python text_to_image_generation.py \ @@ -65,12 +65,11 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` -### Distributed inference with multiple HPUs - -Here is how to generate images with two prompts on two HPUs: +Distributed inference with multiple HPUs is also supported. Below is an example demonstrating how to generate images with two prompts on two HPUs: ```bash python ../gaudi_spawn.py \ @@ -83,17 +82,23 @@ python ../gaudi_spawn.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --distributed ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. +You can run other older Stable Diffusion models in a similar manner. For example, to generate images with Stable Diffusion 1.5, use the option: +`--model_name_or_path stable-diffusion-v1-5/stable-diffusion-v1-5`. Examples showcasing Stable Diffusion 2 are provided next. + ### Stable Diffusion 2 -[Stable Diffusion 2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_2) can also be used to generate images with this script. Here is an example for a single prompt: +[Stable Diffusion 2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_2) can also be used +to generate images with this script. 
Here is an example demonstrating image generation with a single prompt: ```bash python text_to_image_generation.py \ @@ -107,20 +112,22 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion-2 \ + --sdp_on_bf16 \ --bf16 ``` +> [!NOTE] > There are two different checkpoints for Stable Diffusion 2: -> > - use [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) for generating 768x768 images > - use [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) for generating 512x512 images ### Latent Diffusion Model for 3D (LDM3D) -[LDM3D](https://arxiv.org/abs/2305.10853) generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. +[LDM3D](https://arxiv.org/abs/2305.10853) generates both image and depth map data from a given text prompt, allowing users +to generate RGBD images from text prompts. -[Original checkpoint](https://huggingface.co/Intel/ldm3d) and [latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) are open source. -A [demo](https://huggingface.co/spaces/Intel/ldm3d) is also available. Here is how to run this model: +[Original checkpoint](https://huggingface.co/Intel/ldm3d) and [latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) +are open source. A [demo](https://huggingface.co/spaces/Intel/ldm3d) is also available. Here is how to run this model: ```bash python text_to_image_generation.py \ @@ -134,15 +141,13 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion-2 \ - --ldm3d \ - --bf16 + --ldm3d ``` Here is how to generate images and depth maps with two prompts on two HPUs: ```bash -python ../gaudi_spawn.py \ - --world_size 2 text_to_image_generation.py \ +python ../gaudi_spawn.py --world_size 2 text_to_image_generation.py \ --model_name_or_path "Intel/ldm3d-4c" \ --prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \ --num_images_per_prompt 10 \ @@ -157,15 +162,16 @@ python ../gaudi_spawn.py \ --distributed ``` +> [!NOTE] > There are three different checkpoints for LDM3D: -> > - use [original checkpoint](https://huggingface.co/Intel/ldm3d) to generate outputs from the paper > - use [the latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) for generating improved results > - use [the pano checkpoint](https://huggingface.co/Intel/ldm3d-pano) to generate panoramic view ### Stable Diffusion XL (SDXL) -Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/pdf/2307.01952.pdf) by the Stability AI team. +Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/pdf/2307.01952.pdf) +by the Stability AI team. Here is how to generate SDXL images with a single prompt: @@ -175,37 +181,22 @@ python text_to_image_generation.py \ --prompts "Sailing ship painting by Van Gogh" \ --num_images_per_prompt 28 \ --batch_size 7 \ + --num_inference_steps 30 \ --image_save_dir /tmp/stable_diffusion_xl_images \ --scheduler euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. 
All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. -Here is how to generate SDXL images with several prompts: - -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ - --num_images_per_prompt 32 \ - --batch_size 8 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -SDXL combines a second text encoder (OpenCLIP ViT-bigG/14) with the original text encoder to significantly -increase the number of parameters. Here is how to generate images with several prompts for both `prompt` -and `prompt_2` (2nd text encoder), as well as their negative prompts: +SDXL integrates a second text encoder (OpenCLIP ViT-bigG/14), alongside the original Stable Diffusion text encoder. This addition significantly increases the number of parameters, enabling more detailed and descriptive prompts. Below is an example of how to generate images using multiple prompts for both `prompt` (primary text encoder) and `prompt_2` (secondary text encoder), along with their respective negative prompts: ```bash python text_to_image_generation.py \ @@ -221,14 +212,14 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` -Here is how to generate SDXL images with two prompts on two HPUs: +SDXL also supports distributed inferencing with Intel Gaudi accelerators. Below is an example of generating SDXL images in a distributed manner using two prompts on two HPUs: ```bash -python ../gaudi_spawn.py \ - --world_size 2 text_to_image_generation.py \ +python ../gaudi_spawn.py --world_size 2 text_to_image_generation.py \ --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ --prompts_2 "Red tone" "Blue tone" \ @@ -241,29 +232,18 @@ python ../gaudi_spawn.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --distributed ``` -Here is how to generate SDXL images with optimized pipeline: -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 \ - --optimize -``` +The performance-optimized SDXL pipeline can be enabled using the `--optimize` option. This option utilizes a more aggressively optimized attention mechanism for enhanced performance. Additionally, it supports running +inference in mixed FP8 precision. 
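For reference, a minimal sketch (not an official example; it simply adds `--optimize` to the base SDXL command shown earlier) of the optimized pipeline running in plain bf16, without FP8 quantization:

```bash
python text_to_image_generation.py \
    --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \
    --prompts "Sailing ship painting by Van Gogh" \
    --num_images_per_prompt 28 \
    --batch_size 7 \
    --num_inference_steps 30 \
    --image_save_dir /tmp/stable_diffusion_xl_images \
    --scheduler euler_discrete \
    --use_habana \
    --use_hpu_graphs \
    --gaudi_config Habana/stable-diffusion \
    --sdp_on_bf16 \
    --bf16 \
    --optimize
```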
-Here is how to generate SDXL images with optimized pipeline in fp8: +Here is how to generate SDXL images with optimized pipeline in FP8 precision: ```bash -QUANT_CONFIG=./quantization/quant_config.json python text_to_image_generation.py \ +QUANT_CONFIG=quantization/stable-diffusion-xl/quantize_config.json \ +python text_to_image_generation.py \ --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ --prompts "Sailing ship painting by Van Gogh" \ --num_images_per_prompt 28 \ @@ -273,17 +253,16 @@ QUANT_CONFIG=./quantization/quant_config.json python text_to_image_generation.py --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --optimize ``` -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - ### SDXL-Turbo -SDXL-Turbo is a distilled version of SDXL 1.0, trained for real-time synthesis. +The knowledge distillation technique can be used to train a distilled version of SDXL, allowing for high-quality +image generation with fewer inference steps. SDXL-Turbo is a distilled version of Stable Diffusion XL 1.0, +optimized for real-time synthesis. Here is how to generate images with multiple prompts: @@ -298,17 +277,16 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --num_inference_steps 1 \ --guidance_scale 1.000001 \ --timestep_spacing trailing ``` -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - -> Note: there is a regression with "--guidance_scale 0.0" in current release which will be addressed in later releases. Setting `--guidance_scale` to a value larger than 1 resolves the regression. +> [!WARNING] +> There is a regression with `--guidance_scale 0.0` in current release which will be addressed in later releases. +> Setting `--guidance_scale` to a value larger than 1 resolves the regression. ### Stable Diffusion 3 (SD3) @@ -327,7 +305,6 @@ huggingface-cli login Here is how to generate SD3 images with a single prompt: ```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ python text_to_image_generation.py \ --model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers \ --prompts "Sailing ship painting by Van Gogh" \ @@ -339,17 +316,57 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` -> For improved performance of the SD3 pipeline on Gaudi, it is recommended to configure the environment -> by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. +This model can also be quantized with some ops running in FP8 precision. 
+ +Before quantization, run stats collection using measure mode: + +```bash +QUANT_CONFIG=quantization/stable-diffusion-3/measure_config.json \ +python text_to_image_generation.py \ + --model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers \ + --prompts "Sailing ship painting by Van Gogh" \ + --num_images_per_prompt 10 \ + --batch_size 1 \ + --num_inference_steps 28 \ + --image_save_dir /tmp/stable_diffusion_3_images \ + --scheduler default \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 \ + --quant_mode measure +``` + +After stats collection, here is how to run SD3 in quantization mode: + +```bash +QUANT_CONFIG=quantization/stable-diffusion-3/quantize_config.json \ +python text_to_image_generation.py \ + --model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers \ + --prompts "Sailing ship painting by Van Gogh" \ + --num_images_per_prompt 10 \ + --batch_size 1 \ + --num_inference_steps 28 \ + --image_save_dir /tmp/stable_diffusion_3_images \ + --scheduler default \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 \ + --quant_mode quantize +``` ### FLUX.1 FLUX.1 was introduced by Black Forest Labs [here](https://blackforestlabs.ai/announcing-black-forest-labs/). -Here is how to run FLUX.1-schnell model (fast version of FLUX.1): +Here is how to run FLUX.1-schnell model (distilled fast version of FLUX.1): ```bash python text_to_image_generation.py \ @@ -359,10 +376,11 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 4 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` @@ -385,10 +403,11 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 30 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` @@ -405,10 +424,11 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 30 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --quant_mode measure ``` @@ -424,43 +444,28 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 30 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --quant_mode quantize ``` ## ControlNet -ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. -It is a type of model for controlling StableDiffusion by conditioning the model with an additional input image. -Here is how to generate images conditioned by canny edge model: +ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) +by Lvmin Zhang and Maneesh Agrawala, enables conditioning the Stable Diffusion model with an additional input image. 
This allows for precise control over the composition of generated images using various features such as edges, pose, depth, and more. -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ - --prompts "futuristic-looking woman" \ - --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -Here is how to generate images conditioned by canny edge model and with multiple prompts: +Here is how to generate images conditioned by Canny edge model: ```bash python text_to_image_generation.py \ --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ - --prompts "futuristic-looking woman" "a rusty robot" \ + --prompts "futuristic-looking woman" \ --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ --num_images_per_prompt 28 \ --batch_size 7 \ @@ -468,14 +473,15 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` -Here is how to generate images conditioned by canny edge model and with two prompts on two HPUs: +The ControlNet example can be run with multiple prompts by supplying more than one prompt in the input. +Additionally, it supports distributed execution. Below is an example of generating images conditioned by the Canny edge model using two prompts on two HPUs: ```bash -python ../gaudi_spawn.py \ - --world_size 2 text_to_image_generation.py \ +python ../gaudi_spawn.py --world_size 2 text_to_image_generation.py \ --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ --prompts "futuristic-looking woman" "a rusty robot" \ @@ -486,46 +492,12 @@ python ../gaudi_spawn.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --distributed ``` -Here is how to generate images conditioned by open pose model: - -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-openpose \ - --prompts "Chef in the kitchen" \ - --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png \ - --control_preprocessing_type "none" \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -Here is how to generate images with conditioned by canny edge model using Stable Diffusion 2 - -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-2-1 \ - --controlnet_model_name_or_path thibaud/controlnet-sd21-canny-diffusers \ - --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png \ - --control_preprocessing_type "none" \ - --prompts "bird" \ - --seed 0 \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/controlnet-2-1_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion-2 \ 
- --bf16 -``` +These ControlNet examples will preprocess the input image to derive Canny edges. Alternatively, you can use `--control_preprocessing_type none` to supply a preprocessed control image directly, enabling many additional use cases. ## Inpainting @@ -547,6 +519,7 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` @@ -554,7 +527,7 @@ python text_to_image_generation.py \ ```bash python text_to_image_generation.py \ - --model_name_or_path diffusers/stable-diffusion-xl-1.0-inpainting-0.1\ + --model_name_or_path diffusers/stable-diffusion-xl-1.0-inpainting-0.1 \ --base_image https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png \ --mask_image https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png \ --prompts "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" \ @@ -566,46 +539,110 @@ python text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` -## Image-to-image Generation +## Additional Stable Diffusion-based Inference Techniques + +This section provides examples of additional inference techniques based on Stable Diffusion. For more details, please refer to +[Hugging Face Diffusers documentation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/overview_techniques). -### Single Prompt +### Unconditional Image Generation -Here is how to generate images with one prompt and one image. -Take instruct-pix2pix as an example. +Here is how to perform unconditional image generation on Intel Gaudi. For more details, please refer to the +[Unconditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/unconditional_image_generation) +section in the Hugging Face documentation. ```bash -python image_to_image_generation.py \ - --model_name_or_path "timbrooks/instruct-pix2pix" \ - --src_image_path "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg" \ - --prompts "turn him into cyborg" \ - --num_images_per_prompt 20 \ - --batch_size 4 \ - --guidance_scale 7.5 \ - --image_guidance_scale 1 \ - --num_inference_steps 10 \ - --image_save_dir /tmp/stable_diffusion_images \ +python unconditional_image_generation.py \ + --model_name_or_path "google/ddpm-ema-celebahq-256" \ + --batch_size 16 \ --use_habana \ + --use_gaudi_ddim_scheduler \ --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 \ + --save_outputs \ + --output_dir "/tmp/" +``` + +### Controlling Brightness + +Here is an example of how to control brightness. For more information, please refer to the +[Control Brightness](https://huggingface.co/docs/diffusers/main/en/using-diffusers/control_brightness) +section in the Hugging Face documentation. + +```bash +PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ +python text_to_image_generation.py \ + --model_name_or_path ptx0/pseudo-journey-v2 \ + --prompts "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" \ + --num_images_per_prompt 1 \ + --batch_size 1 \ + --use_habana \ + --use_hpu_graphs \ + --image_save_dir /tmp/stable_diffusion_images_brightness \ + --seed 33 \ + --use_zero_snr \ + --guidance_scale 0.7 \ + --timestep_spacing trailing +``` + +### Prompt Weighting + +Here is an example of how to run prompt weighting. 
For more information, please refer to the +[Weighted Prompts](https://huggingface.co/docs/diffusers/main/en/using-diffusers/weighted_prompts) +section in the Hugging Face documentation. + +```bash +python text_to_image_generation.py \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ + --prompts "a red cat playing with a ball+++" "a red cat playing with a ball---" \ + --num_images_per_prompt 4 \ + --batch_size 4 \ + --use_habana --use_hpu_graphs \ + --image_save_dir /tmp/stable_diffusion_images_compel \ + --seed 33 \ + --sdp_on_bf16 \ + --bf16 \ + --num_inference_steps 20 \ + --use_compel +``` + +### Controlling Image Quality + +Here is an example of how to improve image quality. For more details, please refer to the +[Image Quality](https://huggingface.co/docs/diffusers/main/en/using-diffusers/image_quality) +section in the Hugging Face documentation. + +```bash +python text_to_image_generation.py \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ + --prompts "A squirrel eating a burger" \ + --num_images_per_prompt 4 \ + --batch_size 4 \ + --use_habana \ + --image_save_dir /tmp/stable_diffusion_images_freeu \ + --seed 33 \ + --use_freeu \ + --sdp_on_bf16 \ --bf16 ``` -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. +## Image-to-Image Generation -### Multiple Prompts +Images can also be generated using initial input images to guide the diffusion-based image generation process. -Here is how to generate images with several prompts and one image. +### Stable Diffusion-based Image-to-Image + +Here is how to generate images using a single prompt and an input image with the `timbrooks/instruct-pix2pix` model, which is based on Stable Diffusion: ```bash python image_to_image_generation.py \ --model_name_or_path "timbrooks/instruct-pix2pix" \ --src_image_path "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg" \ - --prompts "turn him into cyborg" "a strong soldier"\ + --prompts "turn him into cyborg" \ --num_images_per_prompt 20 \ --batch_size 4 \ --guidance_scale 7.5 \ @@ -615,16 +652,18 @@ python image_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. 
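As with the text-to-image examples, this script accepts several prompts at once; a sketch reusing the arguments above with two prompts:

```bash
python image_to_image_generation.py \
    --model_name_or_path "timbrooks/instruct-pix2pix" \
    --src_image_path "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg" \
    --prompts "turn him into cyborg" "a strong soldier" \
    --num_images_per_prompt 20 \
    --batch_size 4 \
    --guidance_scale 7.5 \
    --image_guidance_scale 1 \
    --num_inference_steps 10 \
    --image_save_dir /tmp/stable_diffusion_images \
    --use_habana \
    --use_hpu_graphs \
    --gaudi_config Habana/stable-diffusion \
    --sdp_on_bf16 \
    --bf16
```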
### Stable Diffusion XL Refiner -Here is how to generate SDXL images with a single prompt and one image: +Here is how to refine SDXL images using a single image and prompt: ```bash python image_to_image_generation.py \ @@ -639,20 +678,21 @@ python image_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` -### FLUX.1 Image to Image +### FLUX.1 Image-to-Image -Here is how to generate FLUX.1 images with a single prompt and one input image: +Here is how to generate a FLUX.1 image using a single input image and prompt: ```bash python image_to_image_generation.py \ --model_name_or_path "black-forest-labs/FLUX.1-dev" \ --src_image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ --prompts "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" \ - --num_images_per_prompt 40 \ - --batch_size 10 \ + --num_images_per_prompt 10 \ + --batch_size 1 \ --strength 0.9 \ --guidance_scale 3.5 \ --num_inference_steps 30 \ @@ -660,12 +700,13 @@ python image_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` ### Stable Diffusion Image Variations -Here is how to generate images with one image, it does not accept prompt input +Here is how to generate image variations of a single image (without any input prompts): ```bash python image_to_image_generation.py \ @@ -678,12 +719,13 @@ python image_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` ### Depth to Image Generation -Here is how to generate a depth2img-guided image generation using HPU graphs with BF16: +Here is an example of performing depth-guided image generation: ```bash python depth_to_image_generation.py \ @@ -693,88 +735,24 @@ python depth_to_image_generation.py \ --image_save_dir /tmp/stable_diffusion_images \ --use_habana \ --use_hpu_graphs \ + --sdp_on_bf16 \ --bf16 ``` -## Unconditional Image Generation Example - -Here is how to perform unconditional-image-generation on Gaudi/HPU. - -Original unconditional image generation pipeline is shared in here: [Unconditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/unconditional_image_generation) - -```bash -python unconditional_image_generation.py \ - --model_name_or_path "google/ddpm-ema-celebahq-256" \ - --batch_size 16 \ - --use_habana \ - --use_gaudi_ddim_scheduler \ - --use_hpu_graphs \ - --bf16 \ - --save_outputs \ - --output_dir "/tmp/" -``` - -## Additional inference techniques - -Here is how to run the diffusers examples of inference techniques. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/overview_techniques). +## Text-to-Video Generation -### Controlling brightness - -Here is how to run the example of controlling brightness. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/control_brightness). +This section demonstrates how to use the `GaudiTextToVideoSDPipeline` for text-to-video generation tasks on HPUs. +The pipeline employs a UNet3D structure and generates videos through an iterative denoising process. 
```bash -python text_to_image_generation.py \ - --model_name_or_path ptx0/pseudo-journey-v2 \ - --prompts "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" \ - --num_images_per_prompt 1 \ - --batch_size 1 \ +python text_to_video_generation.py \ + --model_name_or_path ali-vilab/text-to-video-ms-1.7b \ + --prompts "An astronaut riding a horse" \ --use_habana \ --use_hpu_graphs \ - --image_save_dir /tmp/stable_diffusion_images_brightness \ - --seed 33 \ - --use_zero_snr \ - --guidance_scale 0.7 \ - --timestep_spacing trailing + --dtype bf16 ``` -### Prompt weighting - -Here is how to run the example of prompt weighting. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/weighted_prompts). - -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --prompts "a red cat playing with a ball+++" "a red cat playing with a ball---" \ - --num_images_per_prompt 4 \ - --batch_size 4 \ - --use_habana --use_hpu_graphs \ - --image_save_dir /tmp/stable_diffusion_images_compel \ - --seed 33 \ - --bf16 \ - --num_inference_steps 20 \ - --use_compel -``` - -### Controlling image quality - -Here is how to run the example of improving image quality. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/image_quality). - -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --prompts "A squirrel eating a burger" \ - --num_images_per_prompt 4 \ - --batch_size 4 \ - --use_habana \ - --image_save_dir /tmp/stable_diffusion_images_freeu \ - --seed 33 \ - --use_freeu \ - --bf16 -``` # Stable Video Diffusion Examples Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcement](https://stability.ai/news/stable-video-diffusion-open-ai-video-model) @@ -799,9 +777,11 @@ python image_to_video_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` +> [!NOTE] > For improved performance of the image-to-video pipeline on Gaudi, it is recommended to configure the environment > by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. 
@@ -813,53 +793,65 @@ Here is how to generate videos with several image prompts: PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ python image_to_video_generation.py \ --model_name_or_path "stabilityai/stable-video-diffusion-img2vid-xt" \ - --image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" \ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ - "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" \ - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" \ + --image_path \ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" \ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" \ + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" \ --num_videos_per_prompt 1 \ --video_save_dir /tmp/stable_video_diffusion_xt \ --save_frames_as_images \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` +> [!NOTE] > For improved performance of the image-to-video pipeline on Gaudi, it is recommended to configure the environment > by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. -### Image-to-video ControlNet +### Image-to-Video ControlNet Here is how to generate video conditioned by depth: -``` +```bash python image_to_video_generation.py \ --model_name_or_path "stabilityai/stable-video-diffusion-img2vid" \ --controlnet_model_name_or_path "CiaraRowles/temporal-controlnet-depth-svd-v1" \ - --control_image_path "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_0.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_1.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_2.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_3.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_4.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_5.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_6.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_7.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_8.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_9.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_10.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_11.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_12.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_13.png?raw=true" \ + --control_image_path \ + 
"https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_0.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_1.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_2.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_3.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_4.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_5.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_6.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_7.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_8.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_9.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_10.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_11.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_12.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_13.png?raw=true" \ --image_path "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/chair.png?raw=true" \ --video_save_dir SVD_controlnet \ --save_frames_as_images \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 \ --num_frames 14 \ --motion_bucket_id=14 \ --width=512 \ --height=512 ``` + +# Important Notes for Gaudi3 Users + +- **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced. + This issue is expected to be resolved in a future release. + +- **Image-to-Video ControlNet**: The Image-to-Video ControlNet command is currently not supported on Gaudi3. diff --git a/examples/stable-diffusion/depth_to_image_generation.py b/examples/stable-diffusion/depth_to_image_generation.py index 570a39b2c3..c32d61a05b 100755 --- a/examples/stable-diffusion/depth_to_image_generation.py +++ b/examples/stable-diffusion/depth_to_image_generation.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logger = logging.getLogger(__name__) @@ -172,6 +172,12 @@ def main(): ), ) parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") + parser.add_argument( + "--sdp_on_bf16", + action="store_true", + default=False, + help="Allow pyTorch to use reduced precision in the SDPA math backend", + ) parser.add_argument( "--throughput_warmup_steps", type=int, @@ -223,6 +229,7 @@ def main(): "use_habana": args.use_habana, "use_hpu_graphs": args.use_hpu_graphs, "gaudi_config": args.gaudi_config_name, + "sdp_on_bf16": args.sdp_on_bf16, } if args.bf16: diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py index a9f2f81930..68e29c97bd 100755 --- a/examples/stable-diffusion/image_to_image_generation.py +++ b/examples/stable-diffusion/image_to_image_generation.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logger = logging.getLogger(__name__) @@ -193,6 +193,12 @@ def main(): ), ) parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") + parser.add_argument( + "--sdp_on_bf16", + action="store_true", + default=False, + help="Allow pyTorch to use reduced precision in the SDPA math backend", + ) parser.add_argument( "--ldm3d", action="store_true", help="Use LDM3D to generate an image and a depth map from a given text prompt." ) @@ -217,10 +223,10 @@ def main(): args = parser.parse_args() # Set image resolution - res = {} + kwargs_call = {} if args.width > 0 and args.height > 0: - res["width"] = args.width - res["height"] = args.height + kwargs_call["width"] = args.width + kwargs_call["height"] = args.height sdxl_models = ["stable-diffusion-xl", "sdxl"] sdxl = False flux_models = ["FLUX.1"] @@ -230,6 +236,7 @@ def main(): "use_habana": args.use_habana, "use_hpu_graphs": args.use_hpu_graphs, "gaudi_config": args.gaudi_config_name, + "sdp_on_bf16": args.sdp_on_bf16, } # Import selected pipeline @@ -245,7 +252,7 @@ def main(): from optimum.habana.diffusers import GaudiStableDiffusionInstructPix2PixPipeline as Img2ImgPipeline kwargs["safety_checker"] = None - res["image_guidance_scale"] = args.image_guidance_scale + kwargs_call["image_guidance_scale"] = args.image_guidance_scale elif "image-variations" in args.model_name_or_path: from optimum.habana.diffusers import GaudiStableDiffusionImageVariationPipeline as Img2ImgPipeline @@ -284,7 +291,7 @@ def main(): kwargs["torch_dtype"] = torch.bfloat16 if args.throughput_warmup_steps is not None: - kwargs["throughput_warmup_steps"] = args.throughput_warmup_steps + kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps pipeline = Img2ImgPipeline.from_pretrained( args.model_name_or_path, @@ -318,7 +325,7 @@ def main(): output_type=args.output_type, profiling_warmup_steps=args.profiling_warmup_steps, profiling_steps=args.profiling_steps, - **res, + **kwargs_call, ) elif flux: outputs = pipeline( @@ -333,7 +340,7 @@ def main(): output_type=args.output_type, profiling_warmup_steps=args.profiling_warmup_steps, profiling_steps=args.profiling_steps, - **res, + **kwargs_call, ) else: outputs = pipeline( @@ -348,7 +355,7 @@ def main(): output_type=args.output_type, 
profiling_warmup_steps=args.profiling_warmup_steps, profiling_steps=args.profiling_steps, - **res, + **kwargs_call, ) # Save the pipeline in the specified directory if not None @@ -363,12 +370,12 @@ def main(): logger.info(f"Saving images in {image_save_dir.resolve()}...") if args.ldm3d: for i, rgb in enumerate(outputs.rgb): - rgb.save(image_save_dir / f"rgb_{i+1}.png") + rgb.save(image_save_dir / f"rgb_{i + 1}.png") for i, depth in enumerate(outputs.depth): - depth.save(image_save_dir / f"depth_{i+1}.png") + depth.save(image_save_dir / f"depth_{i + 1}.png") else: for i, image in enumerate(outputs.images): - image.save(image_save_dir / f"image_{i+1}.png") + image.save(image_save_dir / f"image_{i + 1}.png") else: logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py index 4112a1b39c..bf5cdb5459 100755 --- a/examples/stable-diffusion/image_to_video_generation.py +++ b/examples/stable-diffusion/image_to_video_generation.py @@ -34,7 +34,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logger = logging.getLogger(__name__) @@ -177,7 +177,31 @@ def main(): ), ) parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") + parser.add_argument( + "--sdp_on_bf16", + action="store_true", + default=False, + help="Allow pyTorch to use reduced precision in the SDPA math backend", + ) parser.add_argument("--num_frames", type=int, default=25, help="The number of video frames to generate.") + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--throughput_warmup_steps", + type=int, + default=None, + help="Number of steps to ignore for throughput calculation.", + ) args = parser.parse_args() # Setup logging @@ -218,6 +242,7 @@ def main(): "use_habana": args.use_habana, "use_hpu_graphs": args.use_hpu_graphs, "gaudi_config": args.gaudi_config_name, + "sdp_on_bf16": args.sdp_on_bf16, } set_seed(args.seed) @@ -261,6 +286,9 @@ def main(): args.model_name_or_path, **kwargs, ) + kwargs_call = {} + if args.throughput_warmup_steps is not None: + kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps # Generate images outputs = pipeline( @@ -277,6 +305,9 @@ def main(): noise_aug_strength=args.noise_aug_strength, decode_chunk_size=args.decode_chunk_size, output_type=args.output_type, + profiling_warmup_steps=args.profiling_warmup_steps, + profiling_steps=args.profiling_steps, + **kwargs_call, ) # Save the pipeline in the specified directory if not None diff --git a/examples/stable-diffusion/quantization/flux/measure_config.json b/examples/stable-diffusion/quantization/flux/measure_config.json index 865078d99f..f90605dba8 100644 --- a/examples/stable-diffusion/quantization/flux/measure_config.json +++ b/examples/stable-diffusion/quantization/flux/measure_config.json @@ -1,5 +1,5 @@ { "method": "HOOKS", "mode": "MEASURE", - "dump_stats_path": "quantization/flux/measure_all/fp8" + "dump_stats_path": "quantization/flux/measure/fp8" } diff --git 
a/examples/stable-diffusion/quantization/flux/quantize_config.json b/examples/stable-diffusion/quantization/flux/quantize_config.json index 8fdb21fccf..e601db4ba4 100644 --- a/examples/stable-diffusion/quantization/flux/quantize_config.json +++ b/examples/stable-diffusion/quantization/flux/quantize_config.json @@ -2,5 +2,5 @@ "method": "HOOKS", "mode": "QUANTIZE", "scale_method": "maxabs_hw_opt_weight", - "dump_stats_path": "quantization/flux/measure_all/fp8" + "dump_stats_path": "quantization/flux/measure/fp8" } diff --git a/examples/stable-diffusion/quantization/stable-diffusion-3/measure_config.json b/examples/stable-diffusion/quantization/stable-diffusion-3/measure_config.json new file mode 100644 index 0000000000..ebf3baa292 --- /dev/null +++ b/examples/stable-diffusion/quantization/stable-diffusion-3/measure_config.json @@ -0,0 +1,5 @@ +{ + "method": "HOOKS", + "mode": "MEASURE", + "dump_stats_path": "quantization/stable-diffusion-3/measure_all/fp8" +} \ No newline at end of file diff --git a/examples/stable-diffusion/quantization/stable-diffusion-3/quantize_config.json b/examples/stable-diffusion/quantization/stable-diffusion-3/quantize_config.json new file mode 100644 index 0000000000..1fa98ebce0 --- /dev/null +++ b/examples/stable-diffusion/quantization/stable-diffusion-3/quantize_config.json @@ -0,0 +1,6 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "scale_method": "maxabs_hw_opt_weight", + "dump_stats_path": "quantization/stable-diffusion-3/measure_all/fp8" +} \ No newline at end of file diff --git a/examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.json b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.json similarity index 100% rename from examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.json rename to examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.json diff --git a/examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.npz b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.npz similarity index 100% rename from examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.npz rename to examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.npz diff --git a/examples/stable-diffusion/quantization/measure_config.json b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure_config.json old mode 100755 new mode 100644 similarity index 52% rename from examples/stable-diffusion/quantization/measure_config.json rename to examples/stable-diffusion/quantization/stable-diffusion-xl/measure_config.json index 04576eeb46..5a250cad7c --- a/examples/stable-diffusion/quantization/measure_config.json +++ b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure_config.json @@ -2,5 +2,5 @@ "method": "HOOKS", "mode": "MEASURE", "observer": "maxabs", - "dump_stats_path": "./quantization/measure/fp8" + "dump_stats_path": "quantization/stable-diffusion-xl/measure/fp8" } diff --git a/examples/stable-diffusion/quantization/quant_config.json b/examples/stable-diffusion/quantization/stable-diffusion-xl/quantize_config.json old mode 100755 new mode 100644 similarity index 60% rename from examples/stable-diffusion/quantization/quant_config.json rename to examples/stable-diffusion/quantization/stable-diffusion-xl/quantize_config.json index b372905d7f..5d686e659d --- a/examples/stable-diffusion/quantization/quant_config.json +++ 
b/examples/stable-diffusion/quantization/stable-diffusion-xl/quantize_config.json @@ -3,5 +3,5 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "maxabs_hw", - "dump_stats_path": "./quantization/measure/fp8" -} \ No newline at end of file + "dump_stats_path": "quantization/stable-diffusion-xl/measure/fp8" +} diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index 77804275cf..b32fc5c3f6 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -42,7 +42,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logger = logging.getLogger(__name__) @@ -305,6 +305,12 @@ def main(): default=None, help="The file with prompts (for large number of images generation).", ) + parser.add_argument( + "--lora_scale", + type=float, + default=None, + help="A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.", + ) args = parser.parse_args() if args.optimize and not args.use_habana: @@ -380,6 +386,9 @@ def main(): if args.throughput_warmup_steps is not None: kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps + if args.lora_scale is not None: + kwargs_call["lora_scale"] = args.lora_scale + negative_prompts = args.negative_prompts if args.distributed: distributed_state = PartialState() @@ -441,8 +450,7 @@ def main(): kwargs_call["quant_mode"] = args.quant_mode # Instantiate a Stable Diffusion pipeline class - import habana_frameworks.torch.core as htcore # noqa: F401 - + quant_config_path = os.getenv("QUANT_CONFIG") if sdxl: # SDXL pipelines if controlnet: @@ -473,7 +481,6 @@ def main(): pipeline.unet.set_default_attn_processor(pipeline.unet) pipeline.to(torch.device("hpu")) - quant_config_path = os.getenv("QUANT_CONFIG") if quant_config_path: import habana_frameworks.torch.core as htcore from neural_compressor.torch.quantization import FP8Config, convert, prepare @@ -685,12 +692,12 @@ def main(): logger.info(f"Saving images in {image_save_dir.resolve()}...") if args.ldm3d: for i, rgb in enumerate(outputs.rgb): - rgb.save(image_save_dir / f"rgb_{i+1}.png") + rgb.save(image_save_dir / f"rgb_{i + 1}.png") for i, depth in enumerate(outputs.depth): - depth.save(image_save_dir / f"depth_{i+1}.png") + depth.save(image_save_dir / f"depth_{i + 1}.png") else: for i, image in enumerate(outputs.images): - image.save(image_save_dir / f"image_{i+1}.png") + image.save(image_save_dir / f"image_{i + 1}.png") else: logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") diff --git a/examples/text-to-video/text_to_video_generation.py b/examples/stable-diffusion/text_to_video_generation.py similarity index 99% rename from examples/text-to-video/text_to_video_generation.py rename to examples/stable-diffusion/text_to_video_generation.py index 4a91359617..8813e321cf 100755 --- a/examples/text-to-video/text_to_video_generation.py +++ b/examples/stable-diffusion/text_to_video_generation.py @@ -37,7 +37,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/training/README.md b/examples/stable-diffusion/training/README.md index a10c194066..4ea85c9e36 100644 --- a/examples/stable-diffusion/training/README.md +++ b/examples/stable-diffusion/training/README.md @@ -18,61 +18,71 @@ limitations under the License. This directory contains scripts that showcase how to perform training/fine-tuning of Stable Diffusion models on Habana Gaudi. - ## Textual Inversion [Textual Inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like Stable Diffusion on your own images using just 3-5 examples. -The `textual_inversion.py` script shows how to implement the training procedure on Habana Gaudi. - -### Cat Toy Example +The `textual_inversion.py` script shows how to implement the training procedure on Habana Gaudi. In the examples below, we will use a set of cat images from the following dataset: [https://huggingface.co/datasets/diffusers/cat_toy_example](https://huggingface.co/datasets/diffusers/cat_toy_example) -Let's first download this dataset locally: - -```python -from huggingface_hub import snapshot_download -from pathlib import Path -import shutil - -local_dir = './cat' -snapshot_download( - 'diffusers/cat_toy_example', - local_dir=local_dir, - repo_type='dataset', - ignore_patterns='.gitattributes', -) -cache_dir = Path(local_dir, '.cache') -if cache_dir.is_dir(): - shutil.rmtree(cache_dir) +To download this and other example training datasets locally, run: +```bash +python download_train_datasets.py ``` -This will be our training data. Now we can launch the training using: ```bash python textual_inversion.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - --train_data_dir ./cat \ - --learnable_property object \ - --placeholder_token "" \ - --initializer_token toy \ - --resolution 512 \ - --train_batch_size 4 \ - --max_train_steps 3000 \ - --learning_rate 5.0e-04 \ - --scale_lr \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir /tmp/textual_inversion_cat \ - --save_as_full_pipeline \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ + --train_data_dir ./cat \ + --learnable_property object \ + --placeholder_token "" \ + --initializer_token toy \ + --resolution 512 \ + --train_batch_size 4 \ + --max_train_steps 3000 \ + --learning_rate 5.0e-04 \ + --scale_lr \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir /tmp/textual_inversion_cat \ + --save_as_full_pipeline \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 +``` + +> [!NOTE] +> Change `--resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. + +> [!NOTE] +> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. +> However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. +> This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, +> *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. 
+ +Once you have trained a model as described above, inference can be done using `GaudiStableDiffusionPipeline`. +Please make sure to include the `placeholder_token` in your prompt so that textual inversion guided inference can take effect. + +You can use `text_to_image_generation.py` sample to run inference with the fine-tuned model: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path /tmp/textual_inversion_cat \ + --prompts "A backpack" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/textual_inversion_cat_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 ``` -The following example shows how to run inference using the fine-tuned model: +Alternatively, you can run inference with the fine-tuned model using a simple Python script like this: ```python from optimum.habana.diffusers import GaudiStableDiffusionPipeline @@ -85,6 +95,7 @@ pipe = GaudiStableDiffusionPipeline.from_pretrained( use_habana=True, use_hpu_graphs=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) prompt = "A backpack" @@ -92,14 +103,6 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] image.save(f"cat-backpack.png") ``` -> Change `--resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. - -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. -> However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. -> This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, -> *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. - - ## Textual Inversion XL The `textual_inversion_sdxl.py` script shows how to implement textual inversion fine-tuning on Gaudi for XL diffusion models @@ -109,32 +112,52 @@ Assuming the afforemenioned cat toy dataset has been obtained, we can launch tex ```bash python textual_inversion_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --train_data_dir ./cat \ - --learnable_property object \ - --placeholder_token "" \ - --initializer_token toy \ - --resolution 768 \ - --train_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --max_train_steps 500 \ - --learning_rate 5.0e-04 \ - --scale_lr \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir /tmp/textual_inversion_cat_sdxl \ - --save_as_full_pipeline \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --train_data_dir ./cat \ + --learnable_property object \ + --placeholder_token "" \ + --initializer_token toy \ + --resolution 768 \ + --train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --max_train_steps 500 \ + --learning_rate 5.0e-04 \ + --scale_lr \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir /tmp/textual_inversion_cat_sdxl \ + --save_as_full_pipeline \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 ``` -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. 
-> However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. -> This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, -> *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. +> [!NOTE] +> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, +> e.g. `""`. However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable +> parameters. This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to +> a number larger than one, e.g.: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. The script also supports training of both text encoders of SDXL, so inference can be executed by inserting a placeholder token into one or both prompts. -The following example shows how to run inference using the fine tuned-model with both text encoders, separately and in combination: + +For example, after training you can use `text_to_image_generation.py` sample to run inference with the fine-tuned model as follows: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path /tmp/textual_inversion_cat_sdxl \ + --prompts "A backpack" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/textual_inversion_cat_sdxl_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference with the fine-tuned model using a simple standalone Python script. +The following script can be used to run inference using the fine-tuned model with both text encoders, +separately and in combination: ```python from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline @@ -147,6 +170,7 @@ pipe = GaudiStableDiffusionXLPipeline.from_pretrained( use_habana=True, use_hpu_graphs=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) prompt = "A backpack" @@ -161,71 +185,77 @@ image = pipe(prompt=prompt, prompt_2=prompt_2, num_inference_steps=50, guidance_ image.save(f"cat-backpack_p1and2.png") ``` -> [!NOTE] -> Change `--resolution` to 768 if you are using [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. - -> [!NOTE] -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, -> e.g. `""`. However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable -> parameters. This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to -> a number larger than one, e.g.: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. - - ## ControlNet Training ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models ](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. It is a type of model for controlling StableDiffusion by conditioning the model with an additional input image. 
This example is adapted from [controlnet example in the diffusers repository](https://github.com/huggingface/diffusers/tree/main/examples/controlnet#training). -First, download the conditioning images as shown below: - +To download the example conditioning images locally, run: ```bash -wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png -wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png +python download_train_datasets.py ``` Then proceed to training with command: ```bash python train_controlnet.py \ - --pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4\ - --output_dir=/tmp/stable_diffusion1_5 \ - --dataset_name=fusing/fill50k \ - --resolution=512 \ - --learning_rate=1e-5 \ - --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 \ - --throughput_warmup_steps=3 \ - --use_hpu_graphs \ - --bf16 \ - --trust_remote_code + --pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4\ + --output_dir=/tmp/stable_diffusion1_4 \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./cnet/conditioning_image_1.png" "./cnet/conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --throughput_warmup_steps=3 \ + --use_hpu_graphs \ + --sdp_on_bf16 \ + --bf16 \ + --trust_remote_code ``` -### Multi-card Run +### Multi-Card Training You can run these fine-tuning scripts in a distributed fashion as follows: ```bash python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - --output_dir=/tmp/stable_diffusion1_5 \ - --dataset_name=fusing/fill50k \ - --resolution=512 \ - --learning_rate=1e-5 \ - --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs \ - --bf16 \ - --trust_remote_code + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ + --output_dir=/tmp/stable_diffusion1_4 \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./cnet/conditioning_image_1.png" "./cnet/conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --throughput_warmup_steps 3 \ + --use_hpu_graphs \ + --sdp_on_bf16 \ + --bf16 \ + --trust_remote_code ``` - ### Inference -Once you have trained a model as described right above, inference can be done simply using the `GaudiStableDiffusionPipeline`. -Make sure to include the `placeholder_token` in your prompt. 
+After training completes, you can use `text_to_image_generation.py` sample to run inference with the fine-tuned ControlNet model: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ + --controlnet_model_name_or_path /tmp/stable_diffusion1_4 \ + --prompts "pale golden rod circle with old lace background" \ + --control_image "./cnet/conditioning_image_1.png" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/controlnet_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference using a simple standalone Python script, as shown below: ```python from diffusers import ControlNetModel, UniPCMultistepScheduler @@ -234,7 +264,7 @@ import torch from optimum.habana.diffusers import GaudiStableDiffusionControlNetPipeline base_model_path = "CompVis/stable-diffusion-v1-4" -controlnet_path = "/tmp/stable_diffusion1_5" +controlnet_path = "/tmp/stable_diffusion1_4" controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.bfloat16) pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( @@ -244,12 +274,13 @@ pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( use_habana=True, use_hpu_graphs=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) # speed up diffusion process with faster scheduler and memory optimization pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) -control_image = load_image("./conditioning_image_1.png") +control_image = load_image("./cnet/conditioning_image_1.png") prompt = "pale golden rod circle with old lace background" # generate image @@ -260,7 +291,6 @@ image = pipe( image.save("./output.png") ``` - ## Fine-Tuning for Stable Diffusion XL The `train_text_to_image_sdxl.py` script shows how to implement the fine-tuning of Stable Diffusion XL models on Gaudi. 
@@ -272,100 +302,102 @@ Install the requirements: pip install -r requirements.txt ``` -### Single-card Training +### Single Card Training To train Stable Diffusion XL on a single Gaudi card, use: ```bash python train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ - --dataset_name lambdalabs/naruto-blip-captions \ - --resolution 512 \ - --crop_resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 16 \ - --max_train_steps 2500 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl_model_output \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 8 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --validation_prompt="a cute naruto creature" \ - --validation_epochs 48 \ - --checkpointing_steps 2500 \ - --logging_step 10 \ - --adjust_throughput + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ + --dataset_name lambdalabs/naruto-blip-captions \ + --resolution 512 \ + --crop_resolution 512 \ + --center_crop \ + --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size 16 \ + --max_train_steps 2500 \ + --learning_rate 1e-05 \ + --max_grad_norm 1 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir sdxl_model_output \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 \ + --dataloader_num_workers 8 \ + --sdp_on_bf16 \ + --bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --validation_prompt="a cute naruto creature" \ + --validation_epochs 48 \ + --checkpointing_steps 2500 \ + --logging_step 10 \ + --adjust_throughput ``` - ### Multi-Card Training To train Stable Diffusion XL on a multi-card Gaudi system, use: ```bash PT_HPU_RECIPE_CACHE_CONFIG=/tmp/stdxl_recipe_cache,True,1024 \ python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ - --dataset_name lambdalabs/naruto-blip-captions \ - --resolution 512 \ - --crop_resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 16 \ - --max_train_steps 336 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl_model_output \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 8 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --validation_prompt="a cute naruto creature" \ - --validation_epochs 48 \ - --checkpointing_steps 336 \ - --mediapipe dataset_sdxl_mediapipe \ - --adjust_throughput + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ + --dataset_name lambdalabs/naruto-blip-captions \ + --resolution 512 \ + --crop_resolution 512 \ + --center_crop \ + --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size 16 \ + --max_train_steps 336 \ + --learning_rate 1e-05 \ + --max_grad_norm 1 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir 
sdxl_model_output \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 \ + --dataloader_num_workers 8 \ + --sdp_on_bf16 \ + --bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --validation_prompt="a cute naruto creature" \ + --validation_epochs 48 \ + --checkpointing_steps 336 \ + --mediapipe dataset_sdxl_mediapipe \ + --adjust_throughput ``` -### Single-Card Training on Gaudi1 +### Single Card Training on Gaudi1 To train Stable Diffusion XL on a single Gaudi1 card, use: ```bash python train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ - --dataset_name lambdalabs/naruto-blip-captions \ - --resolution 256 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --max_train_steps 3000 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl_model_output \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --checkpointing_steps 3000 \ - --bf16 + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ + --dataset_name lambdalabs/naruto-blip-captions \ + --resolution 256 \ + --center_crop \ + --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --max_train_steps 3000 \ + --learning_rate 1e-05 \ + --max_grad_norm 1 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir sdxl_model_output \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --checkpointing_steps 3000 \ + --sdp_on_bf16 \ + --bf16 ``` > [!NOTE] @@ -375,6 +407,24 @@ python train_text_to_image_sdxl.py \ > [!NOTE] > `--mediapipe` only works on Gaudi2. +### Inference + +After training is finished, you can run inference using `text_to_image_generation.py` script as follows: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path sdxl_model_output \ + --prompts "a cute naruto creature" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/stable_diffusion_xl_images \ + --scheduler euler_discrete \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` ## DreamBooth @@ -382,28 +432,12 @@ DreamBooth is a technique for personalizing text-to-image models like Stable Dif of a specific subject. The `train_dreambooth.py` script demonstrates how to implement this training process and adapt it for Stable Diffusion. -### Dog Toy Example - For DreamBooth examples we will use a set of dog images from the following dataset: [https://huggingface.co/datasets/diffusers/dog-example](https://huggingface.co/datasets/diffusers/dog-example). 
-Let's first download this dataset locally: - -```python -from huggingface_hub import snapshot_download -from pathlib import Path -import shutil - -local_dir = './dog' -snapshot_download( - 'diffusers/dog-example', - local_dir=local_dir, - repo_type='dataset', - ignore_patterns='.gitattributes', -) -cache_dir = Path(local_dir, '.cache') -if cache_dir.is_dir(): - shutil.rmtree(cache_dir) +To download this and other example training datasets locally, run: +```bash +python download_train_datasets.py ``` ### Full Model Fine-Tuning @@ -411,26 +445,26 @@ if cache_dir.is_dir(): To launch the multi-card Stable Diffusion training, use: ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth.py \ - --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ - --instance_data_dir="dog" \ - --output_dir="dog_sd" \ - --class_data_dir="path-to-class-images" \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --num_class_images=200 \ - --gradient_accumulation_steps=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --mixed_precision=bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/stable-diffusion \ - full + --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ + --instance_data_dir="dog" \ + --output_dir="dog_sd" \ + --class_data_dir="path-to-class-images" \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --num_class_images=200 \ + --gradient_accumulation_steps=1 \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=800 \ + --mixed_precision=bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --gaudi_config_name Habana/stable-diffusion \ + full ``` Prior preservation is used to prevent overfitting and language drift. For more details, refer to the original paper. @@ -448,27 +482,27 @@ UNet or text encoder. 
To run the multi-card training, use: ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth.py \ - --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ - --instance_data_dir="dog" \ - --output_dir="dog_sd" \ - --class_data_dir="path-to-class-images" \ - --with_prior_preservation \ - --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --num_class_images=200 \ - --gradient_accumulation_steps=1 \ - --learning_rate=1e-4 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --mixed_precision=bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/stable-diffusion \ - lora --unet_r 8 --unet_alpha 8 + --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ + --instance_data_dir="dog" \ + --output_dir="dog_sd" \ + --class_data_dir="path-to-class-images" \ + --with_prior_preservation \ + --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --num_class_images=200 \ + --gradient_accumulation_steps=1 \ + --learning_rate=1e-4 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=800 \ + --mixed_precision=bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --gaudi_config_name Habana/stable-diffusion \ + lora --unet_r 8 --unet_alpha 8 ``` > [!NOTE] > When using PEFT method we can use a much higher learning rate compared to vanilla dreambooth. @@ -498,6 +532,7 @@ python ../text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` @@ -508,54 +543,70 @@ We can use the same `dog` dataset for the following examples. 
To launch Stable Diffusion XL LoRA training on a single Gaudi card, use:
```bash
python train_dreambooth_lora_sdxl.py \
-  --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
-  --instance_data_dir="dog" \
-  --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
-  --output_dir="lora-trained-xl" \
-  --mixed_precision="bf16" \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=4 \
-  --learning_rate=1e-4 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=25 \
-  --seed=0 \
-  --use_hpu_graphs_for_inference \
-  --use_hpu_graphs_for_training \
-  --gaudi_config_name Habana/stable-diffusion
+    --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
+    --instance_data_dir="dog" \
+    --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
+    --output_dir="lora-trained-xl" \
+    --mixed_precision="bf16" \
+    --instance_prompt="a photo of sks dog" \
+    --resolution=1024 \
+    --train_batch_size=1 \
+    --gradient_accumulation_steps=4 \
+    --learning_rate=1e-4 \
+    --lr_scheduler="constant" \
+    --lr_warmup_steps=0 \
+    --max_train_steps=500 \
+    --validation_prompt="A photo of sks dog in a bucket" \
+    --validation_epochs=25 \
+    --seed=0 \
+    --use_hpu_graphs_for_inference \
+    --use_hpu_graphs_for_training \
+    --gaudi_config_name Habana/stable-diffusion
```

To launch Stable Diffusion XL LoRA training on a multi-card Gaudi system, use:
```bash
python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth_lora_sdxl.py \
-  --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
-  --instance_data_dir="dog" \
-  --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
-  --output_dir="lora-trained-xl" \
-  --mixed_precision="bf16" \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=4 \
-  --learning_rate=1e-4 \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=25 \
-  --seed=0 \
-  --use_hpu_graphs_for_inference \
-  --use_hpu_graphs_for_training \
-  --gaudi_config_name Habana/stable-diffusion
+    --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
+    --instance_data_dir="dog" \
+    --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
+    --output_dir="lora-trained-xl" \
+    --mixed_precision="bf16" \
+    --instance_prompt="a photo of sks dog" \
+    --resolution=1024 \
+    --train_batch_size=1 \
+    --gradient_accumulation_steps=4 \
+    --learning_rate=1e-4 \
+    --lr_scheduler="constant" \
+    --lr_warmup_steps=0 \
+    --max_train_steps=500 \
+    --validation_prompt="A photo of sks dog in a bucket" \
+    --validation_epochs=25 \
+    --seed=0 \
+    --use_hpu_graphs_for_inference \
+    --use_hpu_graphs_for_training \
+    --gaudi_config_name Habana/stable-diffusion
```

> [!NOTE]
> To use DeepSpeed instead of MPI, replace `--use_mpi` with `--use_deepspeed` in the previous example.

-After training completes, you can run inference with a simple python script like this:
+After training is completed, you can use the `text_to_image_generation.py` script directly for inference, as shown below:
+```bash
+python ../text_to_image_generation.py \
+    --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \
+    --lora_id lora-trained-xl \
+    --prompts "A picture of a sks
dog in a bucket" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/stable_diffusion_xl_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference with a simple Python script such as this: ```python import torch from optimum.habana import GaudiConfig @@ -567,6 +618,7 @@ pipe = GaudiStableDiffusionXLPipeline.from_pretrained( use_hpu_graphs=True, use_habana=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) pipe.load_lora_weights("lora-trained-xl") @@ -582,21 +634,6 @@ image = pipe( image.save("sdxl-lora.png") ``` -Alternatively, you could directly use `text_to_image_generation.py` sample for inference as follows: -```bash -python ../text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --lora_id lora-trained-xl \ - --prompts "A picture of a sks dog in a bucket" \ - --num_images_per_prompt 5 \ - --batch_size 1 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - ### DreamBooth LoRA Fine-Tuning with FLUX.1-dev We can use the same `dog` dataset for the following examples. @@ -604,60 +641,76 @@ We can use the same `dog` dataset for the following examples. To launch FLUX.1-dev LoRA training on a single Gaudi card, use:" ```bash python train_dreambooth_lora_flux.py \ - --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ - --dataset="dog" \ - --prompt="a photo of sks dog" \ - --output_dir="dog_lora_flux" \ - --mixed_precision="bf16" \ - --weighting_scheme="none" \ - --resolution=1024 \ - --train_batch_size=1 \ - --learning_rate=1e-4 \ - --guidance_scale=1 \ - --report_to="tensorboard" \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --cache_latents \ - --rank=4 \ - --max_train_steps=500 \ - --seed="0" \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - --gaudi_config_name="Habana/stable-diffusion" + --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ + --dataset="dog" \ + --prompt="a photo of sks dog" \ + --output_dir="dog_lora_flux" \ + --mixed_precision="bf16" \ + --weighting_scheme="none" \ + --resolution=1024 \ + --train_batch_size=1 \ + --learning_rate=1e-4 \ + --guidance_scale=1 \ + --report_to="tensorboard" \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --cache_latents \ + --rank=4 \ + --max_train_steps=500 \ + --seed="0" \ + --use_hpu_graphs_for_inference \ + --use_hpu_graphs_for_training \ + --gaudi_config_name="Habana/stable-diffusion" ``` To launch FLUX.1-dev LoRA training on a multi-card Gaudi system, use:" ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth_lora_flux.py \ - --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ - --dataset="dog" \ - --prompt="a photo of sks dog" \ - --output_dir="dog_lora_flux" \ - --mixed_precision="bf16" \ - --weighting_scheme="none" \ - --resolution=1024 \ - --train_batch_size=1 \ - --learning_rate=1e-4 \ - --guidance_scale=1 \ - --report_to="tensorboard" \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --cache_latents \ - --rank=4 \ - --max_train_steps=500 \ - --seed="0" \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - 
--gaudi_config_name="Habana/stable-diffusion" + --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ + --dataset="dog" \ + --prompt="a photo of sks dog" \ + --output_dir="dog_lora_flux" \ + --mixed_precision="bf16" \ + --weighting_scheme="none" \ + --resolution=1024 \ + --train_batch_size=1 \ + --learning_rate=1e-4 \ + --guidance_scale=1 \ + --report_to="tensorboard" \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --cache_latents \ + --rank=4 \ + --max_train_steps=500 \ + --seed="0" \ + --use_hpu_graphs_for_inference \ + --use_hpu_graphs_for_training \ + --gaudi_config_name="Habana/stable-diffusion" ``` > [!NOTE] > To use DeepSpeed instead of MPI, replace `--use_mpi` with `--use_deepspeed` in the previous example -After training completes, you can run inference on Gaudi system with a simple python script like this: +After training completes, you could directly use `text_to_image_generation.py` sample for inference as follows: +```bash +python ../text_to_image_generation.py \ + --model_name_or_path "black-forest-labs/FLUX.1-dev" \ + --lora_id dog_lora_flux \ + --prompts "A picture of a sks dog in a bucket" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/flux_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference on Gaudi system with a simple Python script like this: ```python import torch from optimum.habana import GaudiConfig @@ -669,6 +722,7 @@ pipe = GaudiFluxPipeline.from_pretrained( use_hpu_graphs=True, use_habana=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) pipe.load_lora_weights("dog_lora_flux") @@ -682,18 +736,3 @@ image = pipe( ).images[0] image.save("flux-dev.png") ``` - -Alternatively, you could directly use `text_to_image_generation.py` sample for inference as follows: -```bash -python ../text_to_image_generation.py \ - --model_name_or_path "black-forest-labs/FLUX.1-dev" \ - --lora_id dog_lora_flux \ - --prompts "A picture of a sks dog in a bucket" \ - --num_images_per_prompt 5 \ - --batch_size 1 \ - --image_save_dir /tmp/flux_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` diff --git a/examples/stable-diffusion/training/download_train_datasets.py b/examples/stable-diffusion/training/download_train_datasets.py new file mode 100755 index 0000000000..6ff500c9ef --- /dev/null +++ b/examples/stable-diffusion/training/download_train_datasets.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +import shutil +from pathlib import Path + +from huggingface_hub import hf_hub_download, snapshot_download + + +# Download Cat-Toy example dataset +local_dir = "./cat" +snapshot_download( + repo_id="diffusers/cat_toy_example", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +cache_dir = Path(local_dir, ".cache") +if cache_dir.is_dir(): + shutil.rmtree(cache_dir) + +# Download Dog example dataset +local_dir = "./dog" +snapshot_download( + repo_id="diffusers/dog-example", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +cache_dir = Path(local_dir, ".cache") +if cache_dir.is_dir(): + shutil.rmtree(cache_dir) + +# Download ControlNet example images +local_dir = "./cnet" +file_path1 = hf_hub_download( + repo_id="huggingface/documentation-images", + subfolder="diffusers/controlnet_training", + filename="conditioning_image_1.png", + 
repo_type="dataset", + local_dir=local_dir, +) +file_path2 = hf_hub_download( + repo_id="huggingface/documentation-images", + subfolder="diffusers/controlnet_training", + filename="conditioning_image_2.png", + repo_type="dataset", + local_dir=local_dir, +) +shutil.move(file_path1, local_dir) +shutil.move(file_path2, local_dir) +cache_dir = Path(local_dir, ".cache") +if cache_dir.is_dir(): + shutil.rmtree(cache_dir) +sub_dir = Path(local_dir, "diffusers") +if sub_dir.is_dir(): + shutil.rmtree(sub_dir) diff --git a/examples/stable-diffusion/training/requirements.txt b/examples/stable-diffusion/training/requirements.txt index bf92040ae8..a28ca499b5 100644 --- a/examples/stable-diffusion/training/requirements.txt +++ b/examples/stable-diffusion/training/requirements.txt @@ -1,3 +1,4 @@ imagesize peft == 0.10.0 sentencepiece +compel diff --git a/examples/stable-diffusion/training/textual_inversion.py b/examples/stable-diffusion/training/textual_inversion.py index db488f8749..2f465699b3 100755 --- a/examples/stable-diffusion/training/textual_inversion.py +++ b/examples/stable-diffusion/training/textual_inversion.py @@ -130,6 +130,7 @@ def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight use_habana=True, use_hpu_graphs=True, gaudi_config=args.gaudi_config_name, + sdp_on_bf16=args.sdp_on_bf16, ) pipeline.scheduler = GaudiDDIMScheduler.from_config(pipeline.scheduler.config) pipeline.set_progress_bar_config(disable=True) @@ -415,6 +416,9 @@ def parse_args(): default=None, help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) parser.add_argument( "--throughput_warmup_steps", type=int, @@ -883,7 +887,7 @@ def main(): htcore.mark_step() # Let's make sure we don't update any embedding weights besides the newly added token - index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool) + index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool, device=accelerator.device) index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): diff --git a/examples/stable-diffusion/training/textual_inversion_sdxl.py b/examples/stable-diffusion/training/textual_inversion_sdxl.py old mode 100644 new mode 100755 index 608ee481ad..3ab6c57602 --- a/examples/stable-diffusion/training/textual_inversion_sdxl.py +++ b/examples/stable-diffusion/training/textual_inversion_sdxl.py @@ -392,6 +392,9 @@ def parse_args(): default=None, help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) parser.add_argument( "--throughput_warmup_steps", type=int, @@ -623,6 +626,7 @@ def main(): use_habana=True, use_hpu_graphs=True, gaudi_config=args.gaudi_config_name, + sdp_on_bf16=args.sdp_on_bf16, ) text_encoder_1 = pipeline.text_encoder.to(accelerator.device) text_encoder_2 = pipeline.text_encoder_2.to(accelerator.device) @@ -918,9 +922,9 @@ def main(): htcore.mark_step() # Let's make sure we don't update any embedding weights besides the newly added token - index_no_updates = torch.ones((len(tokenizer_1),), dtype=torch.bool) + index_no_updates = torch.ones((len(tokenizer_1),), dtype=torch.bool, device=accelerator.device) index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False - 
index_no_updates_2 = torch.ones((len(tokenizer_2),), dtype=torch.bool) + index_no_updates_2 = torch.ones((len(tokenizer_2),), dtype=torch.bool, device=accelerator.device) index_no_updates_2[min(placeholder_token_ids_2) : max(placeholder_token_ids_2) + 1] = False with torch.no_grad(): diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py index e676ae6ddf..004cee5af5 100755 --- a/examples/stable-diffusion/training/train_controlnet.py +++ b/examples/stable-diffusion/training/train_controlnet.py @@ -68,7 +68,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") if is_wandb_available(): import wandb @@ -120,6 +120,7 @@ def log_validation( use_habana=True, use_hpu_graphs=args.use_hpu_graphs, gaudi_config=gaudi_config, + sdp_on_bf16=args.sdp_on_bf16, ) pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config) pipeline = pipeline.to(accelerator.device) @@ -438,6 +439,12 @@ def parse_args(input_args=None): ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' ), ) + parser.add_argument( + "--sdp_on_bf16", + action="store_true", + default=False, + help="Allow pyTorch to use reduced precision in the SDPA math backend", + ) parser.add_argument( "--bf16", action="store_true", diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_flux.py b/examples/stable-diffusion/training/train_dreambooth_lora_flux.py index 68b5320d19..1117d0a43f 100755 --- a/examples/stable-diffusion/training/train_dreambooth_lora_flux.py +++ b/examples/stable-diffusion/training/train_dreambooth_lora_flux.py @@ -784,7 +784,7 @@ def load_model_hook(models, input_dir): lora_state_dict = FluxPipeline.lora_state_dict(input_dir) transformer_state_dict = { - f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.") + f"{k.replace('transformer.', '')}": v for k, v in lora_state_dict.items() if k.startswith("transformer.") } transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict) incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default") diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py index b177cf12e6..4e96ee8e0d 100755 --- a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py +++ b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py @@ -94,7 +94,7 @@ def save_model_card( for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f""" - - text: '{validation_prompt if validation_prompt else ' ' }' + - text: '{validation_prompt if validation_prompt else " "}' output: url: "image_{i}.png" @@ -1083,7 +1083,7 @@ def load_model_hook(models, input_dir): lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} + unet_state_dict = {f"{k.replace('unet.', '')}": v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") if incompatible_keys is 
not None: diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index c9d84ae1b9..7bb96e51a1 100755 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -491,6 +491,12 @@ def parse_args(input_args=None): default=False, help=("Whether to use bf16 mixed precision."), ) + parser.add_argument( + "--sdp_on_bf16", + action="store_true", + default=False, + help="Allow pyTorch to use reduced precision in the SDPA math backend", + ) parser.add_argument( "--local_rank", type=int, @@ -878,9 +884,9 @@ def main(args): # download the dataset. if args.dataset_name is not None: if len(args.mediapipe) > 0: - assert ( - args.resolution == args.crop_resolution - ), f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})" + assert args.resolution == args.crop_resolution, ( + f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})" + ) if args.local_rank == 0: if not os.path.exists(args.mediapipe): os.mkdir(args.mediapipe) @@ -1421,6 +1427,7 @@ def compute_time_ids(original_size, crops_coords_top_left): use_habana=True, use_hpu_graphs=args.use_hpu_graphs_for_inference, gaudi_config=args.gaudi_config_name, + sdp_on_bf16=args.sdp_on_bf16, ) else: # vae and text encoders are frozen, only need to update unet @@ -1525,7 +1532,7 @@ def compute_time_ids(original_size, crops_coords_top_left): image_save_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Saving images in {image_save_dir.resolve()}...") for i, image in enumerate(images): - image.save(image_save_dir / f"image_{epoch}_{i+1}.png") + image.save(image_save_dir / f"image_{epoch}_{i + 1}.png") else: logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") diff --git a/examples/stable-diffusion/unconditional_image_generation.py b/examples/stable-diffusion/unconditional_image_generation.py index 36e35ff90f..bd70d0e4d6 100755 --- a/examples/stable-diffusion/unconditional_image_generation.py +++ b/examples/stable-diffusion/unconditional_image_generation.py @@ -20,7 +20,7 @@ def check_optimum_habana_min_version(*a, **b): check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") # Setup logging logging.basicConfig( @@ -68,6 +68,12 @@ def main(): action="store_true", help="Whether to use bf16 precision for classification.", ) + parser.add_argument( + "--sdp_on_bf16", + action="store_true", + default=False, + help="Allow pyTorch to use reduced precision in the SDPA math backend", + ) parser.add_argument( "--save_outputs", action="store_true", @@ -104,6 +110,7 @@ def main(): "use_habana": args.use_habana, "use_hpu_graphs": args.use_hpu_graphs, "gaudi_config": gaudi_config, + "sdp_on_bf16": args.sdp_on_bf16, } kwargs_call = {"throughput_warmup_steps": args.throughput_warmup_steps} diff --git a/examples/summarization/README.md b/examples/summarization/README.md index 745b293d69..bdaef78edf 100644 --- a/examples/summarization/README.md +++ b/examples/summarization/README.md @@ -179,65 +179,8 @@ python ../gaudi_spawn.py \ ## Using DeepSpeed -Here is an example on 8 HPUs on Gaudi2/Gaudi3 with DeepSpeed-ZeRO3 to fine-tune [FLAN-T5 XXL](https://huggingface.co/google/flan-t5-xxl): -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=512 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed 
run_summarization.py \ - --model_name_or_path google/flan-t5-xxl \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --source_prefix '"summarize: "' \ - --output_dir ./tst-summarization \ - --per_device_train_batch_size 22 \ - --per_device_eval_batch_size 22 \ - --learning_rate 1e-4 \ - --num_train_epochs 3 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --generation_max_length 129 \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --gradient_checkpointing \ - --adam_epsilon 1e-08 --logging_steps 1 \ - --deepspeed ds_flan_t5_z3_config_bf16.json -``` - -Here is an example on 8 HPUs on Gaudi2 with DeepSpeed-ZeRO2 to fine-tune t5-large: -```bash -PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ - --world_size 8 \ - --use_deepspeed run_summarization.py \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json \ - --do_train \ - --do_eval \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy no \ - --throughput_warmup_steps 15 \ - --model_name_or_path t5-large \ - --source_prefix '"summarize:"' \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 20 \ - --per_device_eval_batch_size 20 \ - --max_train_samples 2000 \ - --torch_compile_backend hpu_backend \ - --torch_compile -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. +You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed. +You also can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. ## Inference @@ -262,26 +205,8 @@ python run_summarization.py \ --gaudi_config_name Habana/t5 \ --ignore_pad_token_for_loss False \ --pad_to_max_length \ + --throughput_warmup_steps 3 \ --bf16 \ --bf16_full_eval ``` -You can run inference with BART on the CNN-DailyMail dataset on 1 Gaudi card with the following command: -```bash -python run_summarization.py \ - --model_name_or_path facebook/bart-large-cnn \ - --do_predict \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_eval_batch_size 2 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/bart \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --num_beams 1 -``` diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 28498fc0a2..a14e0e1dea 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -66,7 +66,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -559,9 +559,9 @@ def main(): return if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): - assert ( - data_args.lang is not None - ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + assert data_args.lang is not None, ( + f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + ) tokenizer.src_lang = data_args.lang tokenizer.tgt_lang = data_args.lang diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 57bf7cbb05..6a78ecd91e 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -58,7 +58,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -168,9 +168,9 @@ def __post_init__(self): train_extension = self.train_file.split(".")[-1] assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." validation_extension = self.validation_file.split(".")[-1] - assert ( - validation_extension == train_extension - ), "`validation_file` should have the same extension (csv or json) as `train_file`." + assert validation_extension == train_extension, ( + "`validation_file` should have the same extension (csv or json) as `train_file`." + ) @dataclass @@ -338,9 +338,9 @@ def main(): if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] - assert ( - test_extension == train_extension - ), "`test_file` should have the same extension (csv or json) as `train_file`." + assert test_extension == train_extension, ( + "`test_file` should have the same extension (csv or json) as `train_file`." + ) data_files["test"] = data_args.test_file else: raise ValueError("Need either a GLUE task or a test file for `do_predict`.") diff --git a/examples/text-feature-extraction/README.md b/examples/text-feature-extraction/README.md index 9c34ede54a..e46168840b 100644 --- a/examples/text-feature-extraction/README.md +++ b/examples/text-feature-extraction/README.md @@ -28,12 +28,6 @@ python run_feature_extraction.py \ "BERT is a common machine learning architecture for text-based applications." \ "Alexander Hamilton is one of the founding fathers of the United States." 
\ --use_hpu_graphs \ + --sdp_on_bf16 \ --bf16 ``` - -Models that have been validated: - -- [Supabase/gte-small](https://huggingface.co/Supabase/gte-small) -- [thenlper/gte-small](https://huggingface.co/thenlper/gte-small) -- [thenlper/gte-base](https://huggingface.co/thenlper/gte-base) -- [thenlper/gte-large](https://huggingface.co/thenlper/gte-large) diff --git a/examples/text-feature-extraction/run_feature_extraction.py b/examples/text-feature-extraction/run_feature_extraction.py index 47320b1979..159f36b488 100644 --- a/examples/text-feature-extraction/run_feature_extraction.py +++ b/examples/text-feature-extraction/run_feature_extraction.py @@ -83,6 +83,9 @@ def parse_args(): action="store_true", help="Whether to perform generation in bf16 precision.", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) parser.add_argument( "--warmup", type=int, @@ -100,6 +103,8 @@ def parse_args(): def main(): args = parse_args() + if args.sdp_on_bf16: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) model = AutoModel.from_pretrained(args.model_name_or_path).to("hpu") diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index bd81c7f7fe..5d399f65dd 100755 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -33,7 +33,7 @@ pip install -r requirements_lm_eval.txt Then, if you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html) (e.g. to use BLOOM/BLOOMZ), you should install DeepSpeed as follows: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 ``` @@ -79,7 +79,8 @@ python run_generation.py \ --use_kv_cache \ --max_new_tokens 100 \ --do_sample \ ---prompt "Here is my prompt" +--prompt "Here is my prompt" \ +--sdp_on_bf16 ``` If you want to provide several prompts as inputs, here is how to do it: @@ -91,7 +92,8 @@ python run_generation.py \ --max_new_tokens 100 \ --do_sample \ --batch_size 2 \ ---prompt "Hello world" "How are you?" +--prompt "Hello world" "How are you?" \ +--sdp_on_bf16 ``` > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size. 
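For example, a run with three prompts would set the batch size to at least 3 so that none of them are dropped. This is only an illustrative sketch; the model name and prompts are placeholders:

```bash
python run_generation.py \
--model_name_or_path gpt2 \
--use_hpu_graphs \
--use_kv_cache \
--max_new_tokens 100 \
--batch_size 3 \
--prompt "Hello world" "How are you?" "What is DeepSpeed?" \
--sdp_on_bf16
```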
@@ -110,7 +112,8 @@ python run_generation.py \ --use_kv_cache \ --num_return_sequences 1 \ --temperature 0 \ ---prompt "Alice and Bob" +--prompt "Alice and Bob" \ +--sdp_on_bf16 ``` ### Benchmark @@ -138,7 +141,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --batch_size 1 \ --use_hpu_graphs \ --use_kv_cache \ ---max_new_tokens 100 +--max_new_tokens 100 \ +--sdp_on_bf16 ``` You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command: @@ -153,7 +157,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --attn_softmax_bf16 \ --limit_hpu_graphs \ --reuse_cache \ ---trim_logits +--trim_logits \ +--sdp_on_bf16 ``` To run Falcon-7B inference, use the following command: @@ -165,7 +170,8 @@ python run_generation.py \ --use_kv_cache \ --batch_size 1 \ --max_new_tokens 128 \ - --do_sample + --do_sample \ + --sdp_on_bf16 ``` To run Falcon-40B inference on 8 Gaudi2 cards, use the following command: @@ -182,6 +188,20 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --flash_attention_causal_mask ``` +To run Llama3-405B inference on 8 Gaudi3 cards use the following command: +```bash +python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ +--model_name_or_path meta-llama/Llama-3.1-405B-Instruct \ +--max_new_tokens 2048 \ +--bf16 \ +--use_hpu_graphs \ +--use_kv_cache \ +--batch_size 1 \ +--do_sample \ +--use_flash_attention \ +--flash_attention_causal_mask +``` + > To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should: > - have a HF account > - agree to the terms of use of the model in its model card on the HF Hub @@ -196,7 +216,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ > --use_hpu_graphs \ > --use_kv_cache \ > --max_new_tokens 100 \ -> --bf16 +> --bf16 \ +> --sdp_on_bf16 > ``` ### Use any dataset from the Hugging Face Hub @@ -215,7 +236,8 @@ python run_generation.py \ --use_kv_cache \ --dataset_name JulesBelveze/tldr_news \ --column_name content \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` > The prompt length is limited to 16 tokens. Prompts longer than this will be truncated. @@ -234,7 +256,8 @@ python run_generation.py \ --bf16 \ --max_new_tokens 100 \ --prompt "Here is my prompt" \ ---peft_model yard1/llama-2-7b-sql-lora-test +--peft_model yard1/llama-2-7b-sql-lora-test \ +--sdp_on_bf16 ``` ### Using growing bucket optimization @@ -330,7 +353,7 @@ PT_ENABLE_INT64_SUPPORT=1 PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py --world_s ### Running with FP8 -Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. 
+Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B, phi-2 and Llama3-405B in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. More information on enabling fp8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html @@ -453,6 +476,44 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ --flash_attention_causal_mask ``` +Here is an example to measure the tensor quantization statistics on Llama3-405B with 8 cards: +> Please note that Llama3-405B requires minimum 16 cards Gaudi2 and 8 cards Gaudi3. +```bash +QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 8 run_lm_eval.py \ +-o acc_llama3_405b_bs1_quant.txt \ +--model_name_or_path meta-llama/Llama-3.1-405B-Instruct \ +--use_hpu_graphs \ +--use_kv_cache \ +--trim_logits \ +--batch_size 1 \ +--bf16 \ +--reuse_cache \ +--use_flash_attention \ +--flash_attention_recompute \ +--flash_attention_causal_mask +``` + +Here is an example to quantize the model based on previous measurements for Llama3-405B with 8 cards: +> Please note that Llama3-405B requires minimum 16 cards Gaudi2 and 8 cards Gaudi3. +```bash +QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 8 run_generation.py \ +--model_name_or_path meta-llama/Llama-3.1-405B-Instruct \ +--use_hpu_graphs \ +--use_kv_cache \ +--limit_hpu_graphs \ +--max_input_tokens 2048 \ +--max_new_tokens 2048 \ +--batch_size 2 \ +--bf16 \ +--reuse_cache \ +--trim_logits \ +--use_flash_attention \ +--flash_attention_recompute \ +--flash_attention_causal_mask +``` + Here is an example to measure the tensor quantization statistics on phi-2 with 1 card: ```bash @@ -491,7 +552,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py --max_new_tokens 100 \ --batch_size 1 \ --reuse_cache \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for gemma with 1 card: @@ -503,7 +565,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_gemma.json python run_generation --max_new_tokens 100 \ --batch_size 1 \ --reuse_cache \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` @@ -513,14 +576,13 @@ Some bf16 models don't fit on one card due to hpu memory limitation, but in fp8 As measurement is being calculated in bf16 precision, to be able to run fp8 model on single card you should use `unify_measurements` script. Here are the steps: 1. Measure the model on a number of cards that are enough for the model to fit in BF16. -2. Quantize the model on the same amount of cards for scales to be saved. -3. Run unify_measurements.py script using the measurement files created after running steps 1 and 2. A unified measurement is then calculated. +2. Run unify_measurements.py script using the measurement files created in step 1. A unified measurement is then calculated. 
```bash python quantization_tools/unify_measurements.py -g 01234567 -m *path_to_8x_measurements* -o *path_to_output_1x_measurement* ``` In the above example, the measurements of cards 0-7 will be unified to a single measurement. For example, if you specify `-g 0123 4567`, cards 0-3 and cards 4-7 will be unified in two different measurement files. All different group combinations are supported. -4. Run quantization using the unified measurement file/s. +3. Run quantization using the unified measurement file/s. More information on usage of the unifier script can be found in fp8 Habana docs: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html @@ -568,10 +630,8 @@ https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UI Below is an example to load a model with 4bit checkpoints from Hugging Face. Please note that model name is denoted as ``. -Additionally, the below env vars are used for performance optimizations, and are planned to be removed in future version: -`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1` + ```bash -SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 \ python run_lm_eval.py \ -o acc_load_uint4_model.txt \ --model_name_or_path \ @@ -594,12 +654,10 @@ Currently, only uint4 checkpoints and single-device configurations are supported More information on enabling 4-bit inference in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UINT4.html?highlight=inference%20using%20int4#enabling-and-running-uint4-in-pytorch-models. -Below is an example of loading a llama7b model with a 4bit checkpoint quantized in INC. +Below is an example of loading a llama2-7b model with a 4bit checkpoint quantized in INC. Please note that the model checkpoint name is denoted as ``. -Additionally, the following environment variables are used for performance optimizations and are planned to be removed in future versions: -`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1` + ```bash -SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 \ python run_lm_eval.py \ -o acc_load_uint4_model.txt \ --model_name_or_path meta-llama/Llama-2-7b-hf \ @@ -644,26 +702,18 @@ For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Mo ### Running with UINT4 weight quantization using AutoGPTQ - -Llama2-7b in UINT4 weight only quantization is enabled using [AutoGPTQ Fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch. +Llama2-7b in UINT4 weight only quantization is enabled using [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ), which provides quantization capabilities in PyTorch. Currently, the support is for UINT4 inference of pre-quantized models only. ```bash -BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git +BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/AutoGPTQ/AutoGPTQ ``` -You can run a *UINT4 weight quantized* model using AutoGPTQ by setting the following environment variables: -`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=true` before running the command, -and by adding the argument `--load_quantized_model_with_autogptq`. - -***Note:*** -Setting the above environment variables improves performance. These variables will be removed in future releases. 
- +You can run a *UINT4 weight quantized* model using AutoGPTQ by adding the argument `--load_quantized_model_with_autogptq`. Here is an example to run a quantized model : ```bash -SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false \ -ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \ +python run_generation.py \ --attn_softmax_bf16 \ --model_name_or_path \ --use_hpu_graphs \ @@ -678,6 +728,36 @@ ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \ --load_quantized_model_with_autogptq ``` +### Running with UINT4 weight quantization using AutoAWQ + +Llama2-7b supports UINT4 weight-only quantization through [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), which offers quantization capabilities in PyTorch. +Currently, this support is limited to UINT4 inference of pre-quantized models only. + +Please run the following command to install AutoAWQ: +```bash +pip install triton==3.1.0 autoawq +``` + +You can run a *UINT4 weight quantized* model using AutoAWQ by including the argument `--load_quantized_model_with_autoawq`. + +Here is an example of how to run a quantized model : +```bash +python run_generation.py \ +--attn_softmax_bf16 \ +--model_name_or_path \ +--use_hpu_graphs \ +--limit_hpu_graphs \ +--use_kv_cache \ +--bucket_size 128 \ +--bucket_internal \ +--trim_logits \ +--max_new_tokens 128 \ +--batch_size 1 \ +--bf16 \ +--load_quantized_model_with_autoawq +``` + + ## Language Model Evaluation Harness The evaluation of LLMs can be done using the `lm_eval.py` script. It utilizes the [LM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness) diff --git a/examples/text-generation/quantization_config/maxabs_quant_arbitrary.json b/examples/text-generation/quantization_config/maxabs_quant_arbitrary.json new file mode 100644 index 0000000000..0de4d35533 --- /dev/null +++ b/examples/text-generation/quantization_config/maxabs_quant_arbitrary.json @@ -0,0 +1,7 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_arbitrary", + "dump_stats_path": "./hqt_output/measure" +} \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_quant_mixtral.json b/examples/text-generation/quantization_config/maxabs_quant_mixtral.json index 87dc52d08a..caaff8d09e 100644 --- a/examples/text-generation/quantization_config/maxabs_quant_mixtral.json +++ b/examples/text-generation/quantization_config/maxabs_quant_mixtral.json @@ -3,10 +3,7 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "maxabs_hw", - "allowlist": {"types": [], "names": ["gate","w1","w3","w2"]}, - "blocklist": {"types": [], "names": [ - "model.layers.1.block_sparse_moe.experts.(3|4).w2", - "model.layers.[29-31].block_sparse_moe.experts.[0-7].w2" - ]}, + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": ["self_attn"]}, "dump_stats_path": "./hqt_output/measure" } \ No newline at end of file diff --git a/examples/text-generation/quantization_config/pow2_quant.json b/examples/text-generation/quantization_config/pow2_quant.json new file mode 100644 index 0000000000..e1f2eb1c6e --- /dev/null +++ b/examples/text-generation/quantization_config/pow2_quant.json @@ -0,0 +1,7 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_pow2", + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/text-generation/quantization_config/weight_opt_quant.json b/examples/text-generation/quantization_config/weight_opt_quant.json new file mode 100644 index 
0000000000..1ec2dc6b6a --- /dev/null +++ b/examples/text-generation/quantization_config/weight_opt_quant.json @@ -0,0 +1,7 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw_opt_weight", + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py index 4282e4ac49..de2b086c2a 100644 --- a/examples/text-generation/quantization_tools/unify_measurements.py +++ b/examples/text-generation/quantization_tools/unify_measurements.py @@ -6,49 +6,45 @@ import numpy as np -def find_measurement_path(measurement, measurements_dir_path, scales, group_size): +def find_measurement_path(measurement, measurements_dir_path, group_size): measurment_card = measurement + "_" + str(group_size) for measurment_file in os.listdir(measurements_dir_path): filename = os.fsdecode(measurment_file) if not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename: continue - if scales: - if "MAXABS" in filename: - return os.path.join(measurements_dir_path, measurment_file) - else: - if "MAXABS" not in filename: - return os.path.join(measurements_dir_path, measurment_file) + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurment_file) -def unify_measurements( - measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index, scales=False -): + +def unify_measurements(measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index): measurements_paths = [] group_name = "" # save all the jsons paths in the given measurement group for measurement in measurement_group: - measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size) - measurements_paths.append(measurement_path) + measurement_path = find_measurement_path(measurement, measurements_dir_path, groups_size) + if measurement_path is not None: + measurements_paths.append(measurement_path) group_name += measurement - # save all the jsons content in the given measurement group measurements_jsons = [] for measurement_path in measurements_paths: - with open(measurement_path, "r") as f: - js = json.load(f) - measurements_jsons.append(js["Nodes"]) + if measurement_path is not None: + with open(measurement_path, "r") as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) # create a name for the unified json that will be created for this measurement group if groups_num == 1: unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + find_measurement_path(measurement_group[0], measurements_dir_path, groups_size) .split("/")[-1] .replace("_" + measurement_group[0] + "_" + str(groups_size), "") ) else: unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + find_measurement_path(measurement_group[0], measurements_dir_path, groups_size) .split("/")[-1] .replace( "_" + measurement_group[0] + "_" + str(groups_size), "_" + str(group_index) + "_" + str(groups_num) @@ -74,70 +70,27 @@ def unify_measurements( max_weight = node_values["params"]["weight"] # iterate over all the measurment group and take the maximum for each tensor and its channel - if scales: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i]) - if max_outputs 
is not None: - if isinstance(max_outputs[0], list): - for i in range(0, len(max_outputs)): - for j in range(0, len(max_outputs[i])): - max_outputs[i][j] = max( - measurement_json[node_name]["outputs"][i][j], max_outputs[i][j] - ) - else: - for i in range(0, len(max_outputs)): - max_outputs[i] = max(measurement_json[node_name]["outputs"][i], max_outputs[i]) - if max_weight is not None: - if isinstance(max_weight, dict): - for key, values in max_weight.items(): - for i in range(0, len(values)): - max_weight[key][i] = max( - measurement_json[node_name]["params"]["weight"][key][i], max_weight[key][i] - ) - else: - max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) - else: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - for j in range(0, len(max_inputs[i])): - max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) - if max_outputs is not None: - for i in range(0, len(max_outputs)): - max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) - if max_weight is not None: - for i in range(0, len(max_weight)): - max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) - - # update the maximum in the unified json - if scales: - for i in range(0, len(max_inputs)): - unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i] - if max_outputs is not None: - if isinstance(max_outputs[0], list): - for i in range(0, len(max_outputs)): - for j in range(0, len(max_outputs[i])): - unified_json["Nodes"][node_name]["outputs"][i][j] = max_outputs[i][j] - else: - for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i] = max_outputs[i] - if max_weight is not None: - if isinstance(max_weight, dict): - for key, values in max_weight.items(): - for i in range(0, len(values)): - unified_json["Nodes"][node_name]["params"]["weight"][key][i] = max_weight[key][i] - else: - unified_json["Nodes"][node_name]["params"]["weight"] = max_weight - else: + for measurement_json in measurements_jsons: for i in range(0, len(max_inputs)): for j in range(0, len(max_inputs[i])): - unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) if max_outputs is not None: for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) if max_weight is not None: for i in range(0, len(max_weight)): - unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] + max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) + + # update the maximum in the unified json + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + if max_outputs is not None: + for i in range(0, len(max_outputs)): + unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + if max_weight is not None: + for i in range(0, len(max_weight)): + unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] global_rank = None local_rank = group_index if groups_num != 1 else -1 mode = "" @@ -153,10 +106,10 @@ def unify_measurements( layers[layer] = {} layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] if dlayer.get("outputs") is not None: 
- layers[layer]["outputs"] = np.array(dlayer["outputs"]) + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: layers[layer]["params"] = {} - layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + layers[layer]["params"]["weight"] = [np.array(x) for x in dlayer["params"]["weight"]] df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} with open(unified_npz_path, "w"): np.savez(unified_npz_path, df) @@ -196,26 +149,14 @@ def main(args): groups = args.groups num_jsons_drange = 0 - num_jsons_scales = 0 for path in os.listdir(measurements_path): - if path.endswith(".json"): - if "MAXABS" in path: - num_jsons_scales += 1 - elif "mod_list" not in path: - num_jsons_drange += 1 - assert ( - os.path.isdir(measurements_path) - and (num_jsons_drange % len(groups)) == 0 - and (num_jsons_scales % len(groups)) == 0 - ) + if path.endswith(".json") and "MAXABS" not in path and "mod_list" not in path: + num_jsons_drange += 1 + + assert os.path.isdir(measurements_path) and (num_jsons_drange % len(groups)) == 0 for group_index, group in enumerate(groups): - unify_measurements( - group, measurements_path, output_path, num_jsons_drange, len(groups), group_index, scales=False - ) - unify_measurements( - group, measurements_path, output_path, num_jsons_scales, len(groups), group_index, scales=True - ) + unify_measurements(group, measurements_path, output_path, num_jsons_drange, len(groups), group_index) print("finished measurement unifier script") diff --git a/examples/text-generation/requirements_lm_eval.txt b/examples/text-generation/requirements_lm_eval.txt index e632dc1236..322cf204bf 100644 --- a/examples/text-generation/requirements_lm_eval.txt +++ b/examples/text-generation/requirements_lm_eval.txt @@ -1,2 +1 @@ -https://github.com/EleutherAI/lm-evaluation-harness/archive/0bf683b4e6a9df359b3156ba9ba8d62bdd47e0c0.zip -datasets==2.21.0 +https://github.com/EleutherAI/lm-evaluation-harness/archive/refs/tags/v0.4.7.zip diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index a20783511d..274aa591cd 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -226,6 +226,11 @@ def setup_parser(parser): action="store_true", help="Skip HPU Graph usage for first token to save memory", ) + parser.add_argument( + "--clear_hpu_graphs_cache", + action="store_true", + help="Clear HPU graphs cache", + ) parser.add_argument( "--show_graphs_count", action="store_true", @@ -320,6 +325,9 @@ def setup_parser(parser): action="store_true", help="Run the inference with dataset for specified --n_iterations(default:5)", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) quant_parser_group = parser.add_mutually_exclusive_group() quant_parser_group.add_argument( @@ -327,6 +335,11 @@ def setup_parser(parser): action="store_true", help="Load an AutoGPTQ quantized checkpoint using AutoGPTQ.", ) + quant_parser_group.add_argument( + "--load_quantized_model_with_autoawq", + action="store_true", + help="Load an AutoAWQ quantized checkpoint using AutoAWQ.", + ) quant_parser_group.add_argument( "--disk_offload", action="store_true", @@ -364,6 +377,8 @@ def setup_parser(parser): args.quant_config = os.getenv("QUANT_CONFIG", "") if args.quant_config and args.load_quantized_model_with_autogptq: raise 
RuntimeError("Setting both quant_config and load_quantized_model_with_autogptq is unsupported. ") + if args.quant_config and args.load_quantized_model_with_autoawq: + raise RuntimeError("Setting both quant_config and load_quantized_model_with_autoawq is unsupported. ") if args.quant_config == "" and args.disk_offload: logger.warning( @@ -395,6 +410,9 @@ def main(): import habana_frameworks.torch.hpu as torch_hpu + if args.sdp_on_bf16: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) + if args.dataset_name is None: # Benchmark over the prompts below if args.prompt: @@ -526,7 +544,7 @@ def compute_valid_sequence_lengths_tensor(input_tokens): profiling_record_shapes=args.profiling_record_shapes, ).cpu() first_token_time = iteration_times[0] + encode_duration - logger.info(f"Time to first token = {first_token_time*1000}ms") + logger.info(f"Time to first token = {first_token_time * 1000}ms") return tokenizer.batch_decode(outputs, skip_special_tokens=True) from optimum.habana.utils import HabanaProfile @@ -541,10 +559,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens): if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1: for i in range(args.warmup): if dyn_prompt_lens is None: - print(f"Warming up iteration {i+1}/{args.warmup}", flush=True) + print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True) generate(None, args.reduce_recompile) else: - print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i+1}/{args.warmup}", flush=True) + print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True) generate(dyn_prompt_lens[0], args.reduce_recompile) else: if args.bucket_size > 0: @@ -559,7 +577,7 @@ def rounder(x): for i in range(args.warmup): lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size)) for sz in lst: - print(f"Warming up for shape {sz - 1} iteration {i+1}/{args.warmup}", flush=True) + print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True) generate(sz - 1, args.reduce_recompile) torch_hpu.synchronize() compilation_duration = time.perf_counter() - t0 @@ -586,12 +604,12 @@ def rounder(x): all_inputs = [] all_outputs = [] for i, input_sentence in enumerate(zip(input_sentences)): - print(f"input {i+1}: {input_sentence}") + print(f"input {i + 1}: {input_sentence}") all_inputs.append(input_sentence) for j, output in enumerate( zip(generated[args.num_return_sequences * i : args.num_return_sequences * (i + 1)]) ): - print(f"output {i+1}.{j+1}: {output}") + print(f"output {i + 1}.{j + 1}: {output}") all_outputs.append(output) print() @@ -719,22 +737,21 @@ def generate_dataset(batch): return prompt, outputs # warmup - if prompt_length > 0: - from optimum.habana.utils import HabanaProfile + from optimum.habana.utils import HabanaProfile - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - logger.info("Graph compilation...") - t0 = time.perf_counter() - for i, batch in enumerate(dataloader): - generate_dataset(batch) - # The first three iterations take longer because of graph compilation - if (i + 1) == 3: - break - torch_hpu.synchronize() - compilation_duration = time.perf_counter() - t0 - HabanaProfile.enable() + # compilation stage disable profiling + HabanaProfile.disable() + # Compilation + logger.info("Graph compilation...") + t0 = time.perf_counter() + for i, batch in enumerate(dataloader): + generate_dataset(batch) + # The first three iterations take longer because of graph compilation + if (i + 1) == 3: + break + torch_hpu.synchronize() + 
compilation_duration = time.perf_counter() - t0 + HabanaProfile.enable() total_new_tokens_generated = 0 duration = 0 @@ -747,10 +764,10 @@ def generate_dataset(batch): duration += time.perf_counter() - t0 total_new_tokens_generated += args.batch_size * args.max_new_tokens print(separator) - print(f"Batch nĀ°{i+1}") - print(f"Input: {prompt[:args.batch_size]}") + print(f"Batch nĀ°{i + 1}") + print(f"Input: {prompt[: args.batch_size]}") print( - f"Output: {tokenizer.batch_decode(outputs, skip_special_tokens=True)[:args.batch_size*args.num_return_sequences]}" + f"Output: {tokenizer.batch_decode(outputs, skip_special_tokens=True)[: args.batch_size * args.num_return_sequences]}" ) print(separator) if args.run_partial_dataset and args.n_iterations == i + 1: @@ -770,8 +787,7 @@ def generate_dataset(batch): mem = get_hpu_memory_stats() for k, v in mem.items(): print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - if prompt_length > 0: - print(f"Graph compilation duration = {compilation_duration} seconds") + print(f"Graph compilation duration = {compilation_duration} seconds") print(separator) if args.quant_config: finalize_quantization(model) diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py index 689860fc7c..cb3ecd80f0 100644 --- a/examples/text-generation/run_lm_eval.py +++ b/examples/text-generation/run_lm_eval.py @@ -14,32 +14,33 @@ # limitations under the License. ############################################################################### -# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company +# Copyright (C) 2020-2025 Habana Labs, Ltd. an Intel Company ############################################################################### import argparse import json -import logging import multiprocessing as mp import os import time +from typing import Literal, Optional -import lm_eval.evaluator -import lm_eval.tasks import psutil import torch import torch.nn.functional as F +from lm_eval import evaluator, utils +from lm_eval.models.huggingface import HFLM, TemplateLM # Local imports from run_generation import setup_parser +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig from utils import finalize_quantization, initialize_model from optimum.habana.utils import get_hpu_memory_stats os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") -logger = logging.getLogger(__name__) - +logger = utils.eval_logger # This hack is a workaround to limitations of lm_eval which always allocates # mp.Pool with max cpu count which explodes on multinode scenarios and for hpu @@ -52,9 +53,7 @@ def LimitedSpawnPool(_): physical_cpu_count = psutil.cpu_count(logical=False) pool_size = physical_cpu_count world_size = int(os.getenv("WORLD_SIZE", 1)) - if world_size == 0: - world_size = 1 - pool_size //= world_size + pool_size //= max(world_size, 1) if (pool_size * world_size) != physical_cpu_count: pool_size -= 1 return spawn_context.Pool(pool_size) @@ -86,22 +85,52 @@ def setup_lm_eval_parser(): default=["hellaswag", "lambada_openai", "piqa", "winogrande"], ) parser.add_argument("--limit_iters", type=int, help="limit examples to run that many iterations", default=None) + parser.add_argument( + "--show_config", + action="store_true", + default=False, + help="If True, shows the the full config of all tasks at the end of the evaluation.", + ) + parser.add_argument("--max_graphs", type=int, help="Maximum number of HPU graphs", default=None) args = setup_parser(parser) return args -class 
HabanaModelAdapter(lm_eval.base.BaseLM): - def __init__(self, tokenizer, model, args, options): - super().__init__() +class HabanaModelAdapter(HFLM): + def __init__( + self, + tokenizer: AutoTokenizer, + model: AutoModelForCausalLM, + args: argparse.Namespace, + options: GenerationConfig, + backend: Literal["default", "causal", "seq2seq"] = "default", + logits_cache: bool = True, + add_bos_token: Optional[bool] = False, + delta: Optional[str] = None, + **kwargs, + ) -> None: + # To skip cuda code of the HFLM init + TemplateLM.__init__(self) self.tokenizer = tokenizer - self.model = model + self._model = model + self._config = self._model.config self._batch_size = args.batch_size - self.buckets = sorted(args.buckets) + self.buckets: list[int] = sorted(args.buckets) self.options = options - self._device = args.device + self.device_ = args.device + self.pretrained = model + self.peft = args.peft_model + self.delta = delta + # determine which of 'causal' and 'seq2seq' backends to use for HF models + self._get_backend(config=self._config, backend=backend, trust_remote_code=args.trust_remote_code) + self.logits_cache = logits_cache + self.add_bos_token = add_bos_token + self._max_length = options.max_length + self.batch_size_per_gpu = int(args.batch_size) + self.revision = args.model_revision self.model_inputs = {"use_cache": self.options.use_cache} - if self.model.config.model_type in [ + if self._model.config.model_type in [ "llama", "mistral", "falcon", @@ -118,8 +147,18 @@ def __init__(self, tokenizer, model, args, options): "reuse_cache": self.options.reuse_cache, } ) - if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon", "starcoder2", "gemma", "baichuan"]: - if self.model.config.model_type != "falcon": + + if self.model.config.model_type in [ + "llama", + "mistral", + "qwen2", + "falcon", + "starcoder2", + "gemma", + "baichuan", + "gpt_bigcode", + ]: + if self.model.config.model_type not in ["falcon", "gpt_bigcode"]: self.model_inputs.update( { "attn_softmax_bf16": self.options.attn_softmax_bf16, @@ -132,59 +171,43 @@ def __init__(self, tokenizer, model, args, options): "flash_attention_causal_mask": self.options.flash_attention_causal_mask, } ) + if self.model.config.model_type in ["llama", "qwen2", "baichuan", "gpt_bigcode"]: + self.model_inputs.update({"flash_attention_fast_softmax": self.options.flash_attention_fast_softmax}) if args.warmup: self.warm_up() - def warm_up(self): + def warm_up(self) -> None: for bucket_size in reversed(self.buckets): inps = torch.ones((self._batch_size, bucket_size), dtype=torch.int64) self._model_call(inps) - pass @property - def eot_token_id(self): - return self.model.config.eos_token_id + def eot_token_id(self) -> int: + return self._model.config.eos_token_id @property - def max_length(self): + def max_length(self) -> int: return self.buckets[-1] - @property - def max_gen_toks(self): - raise NotImplementedError() - - @property - def batch_size(self): - return self._batch_size - @property def device(self): # We need to do padding ourselves, otherwise we'll end up with recompilations # Returning 'cpu' to keep tensors on CPU in lm_eval code return "cpu" - def tok_encode(self, string): - return self.tokenizer.encode(string) - - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) - - def _model_generate(self, context, max_length, eos_token_id): - raise NotImplementedError() - - def find_bucket(self, length): + def find_bucket(self, length: int) -> list[int]: return [b for b in self.buckets if b >= length][0] - def 
_model_call(self, inps): + def _model_call(self, inps: torch.Tensor) -> torch.Tensor: bs, seq_length = inps.shape padding_length = 0 if self.options.static_shapes: bucket_length = self.find_bucket(seq_length) if self.options.use_cache and self.options.reuse_cache: - self.model.allocate_kv_cache(bs, bucket_length + 1, bucket_length) + self._model.allocate_kv_cache(bs, bucket_length + 1, bucket_length) padding_length = bucket_length - seq_length - inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id) - logits = self.model(inps.to(self._device), **self.model_inputs)["logits"].cpu() + inps = F.pad(inps, (0, padding_length), value=self._model.config.pad_token_id) + logits = self._model(inps.to(self.device_), **self.model_inputs)["logits"].cpu() if self.options.static_shapes and padding_length > 0: logits = logits[:, :-padding_length, :] @@ -192,25 +215,22 @@ def _model_call(self, inps): return logits -def main(): +def main() -> None: + # Modified based on cli_evaluate function in https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.7/lm_eval/__main__.py/#L268 args = setup_lm_eval_parser() model, _, tokenizer, generation_config = initialize_model(args, logger) - if args.trust_remote_code: # trust_remote_code fix was introduced in lm_eval 0.4.3 - # https://github.com/EleutherAI/lm-evaluation-harness/pull/1998/files - # We need to cherry-pick the fix manually untill we upgrade (SW-190418) import datasets datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True - lm_tasks = lm_eval.tasks.get_task_dict(args.tasks) with torch.no_grad(): lm = HabanaModelAdapter(tokenizer, model, args, generation_config) eval_start = time.perf_counter() with torch.no_grad(): - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit_iters) + results = evaluator.simple_evaluate(lm, tasks=args.tasks, limit=args.limit_iters) if args.device == "hpu": import habana_frameworks.torch.hpu as torch_hpu @@ -225,7 +245,13 @@ def main(): mem = get_hpu_memory_stats() for k, v in mem.items(): print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - json.dump(results, open(args.output_file, "w"), indent=2) + + json_str = json.dumps(results, indent=2, default=utils.handle_non_serializable, ensure_ascii=False) + with open(args.output_file, "w", encoding="utf-8") as f: + f.write(json_str) + if args.show_config: + print(json_str) + if args.quant_config: finalize_quantization(model) diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md index a10792be2a..2aa036ec3a 100644 --- a/examples/text-generation/text-generation-pipeline/README.md +++ b/examples/text-generation/text-generation-pipeline/README.md @@ -22,7 +22,7 @@ The text-generation pipeline can be used to perform text-generation by providing If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 ``` If you would like to use the pipeline with LangChain classes, you can install LangChain as follows: diff --git a/examples/text-generation/text-generation-pipeline/run_pipeline.py b/examples/text-generation/text-generation-pipeline/run_pipeline.py index 43aea65cec..11e542d7a5 100644 --- a/examples/text-generation/text-generation-pipeline/run_pipeline.py +++ 
b/examples/text-generation/text-generation-pipeline/run_pipeline.py @@ -45,14 +45,14 @@ def main(): duration = 0 for iteration in range(args.n_iterations): - logger.info(f"Running inference iteration {iteration+1}...") + logger.info(f"Running inference iteration {iteration + 1}...") t0 = time.perf_counter() output = pipe(input_sentences) duration += time.perf_counter() - t0 for i, (input_sentence, generated_text) in enumerate(zip(input_sentences, output)): - print(f"Prompt[{iteration+1}][{i+1}]: {input_sentence}") - print(f"Generated Text[{iteration+1}][{i+1}]: {repr(generated_text)}\n") + print(f"Prompt[{iteration + 1}][{i + 1}]: {input_sentence}") + print(f"Generated Text[{iteration + 1}][{i + 1}]: {repr(generated_text)}\n") throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds") diff --git a/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py b/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py index 556494cd37..6212e808aa 100644 --- a/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py +++ b/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py @@ -87,8 +87,8 @@ def main(): duration += time.perf_counter() - t0 for i, (question, answer) in enumerate(zip(input_questions, responses)): - print(f"Question[{iteration+1}][{i+1}]: {question['question']}") - print(f"Response[{iteration+1}][{i+1}]: {answer}\n") + print(f"Question[{iteration + 1}][{i + 1}]: {question['question']}") + print(f"Response[{iteration + 1}][{i + 1}]: {answer}\n") throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds") diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 637318b7be..5ef9c5c858 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -252,13 +252,14 @@ def setup_model(args, model_dtype, model_kwargs, logger): model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs ) - elif args.load_quantized_model_with_inc: - # TODO: This will be removed in v1.19 Synapse release - # Override neural_compressor _load_remaining_pretrained_weight for the Transformer 4.45 release. 
- import neural_compressor.torch.algorithms.weight_only.save_load as nc_sl - - nc_sl.WOQModelLoader._load_remaining_pretrained_weight = local_load_remaining_pretrained_weight + elif args.load_quantized_model_with_autoawq: + from transformers import AwqConfig + quantization_config = AwqConfig(bits=4, version="hpu") + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs + ) + elif args.load_quantized_model_with_inc: from neural_compressor.torch.quantization import load model = load(model_name_or_path=args.model_name_or_path, format="huggingface", device="hpu", **model_kwargs) @@ -277,9 +278,6 @@ def setup_model(args, model_dtype, model_kwargs, logger): original_model=org_model, **model_kwargs, ) - # TODO: This will be removed in v1.19 Synapse release - # the loaded model should have the same dtype as original_model - model = model.to(model_kwargs["torch_dtype"]) else: if args.assistant_model is not None: assistant_model = AutoModelForCausalLM.from_pretrained( @@ -306,7 +304,8 @@ def setup_model(args, model_dtype, model_kwargs, logger): if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": model = wrap_in_hpu_graph(model, hash_with_views=False) else: - model = wrap_in_hpu_graph(model) + max_graphs = getattr(args, "max_graphs", None) + model = wrap_in_hpu_graph(model, max_graphs=max_graphs) if args.assistant_model is not None: assistant_model = wrap_in_hpu_graph(assistant_model) if _is_peft_model(model): @@ -442,7 +441,8 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load with deepspeed.OnDevice(dtype=model_dtype, device="meta"): if ( - config.rope_scaling + hasattr(config, "rope_scaling") + and config.rope_scaling and config.rope_scaling["rope_type"] == "llama3" and config.max_position_embeddings > 8192 ): @@ -618,6 +618,12 @@ def setup_tokenizer(args, model, assistant_model, logger): ) model.generation_config.eos_token_id = model.generation_config.eos_token_id[-1] + if model.config.model_type == "mpt": + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if model.generation_config.pad_token_id is None: + model.generation_config.pad_token_id = tokenizer.eos_token_id + # Some models like GPT2 do not have a PAD token so we have to set it if necessary if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -655,6 +661,7 @@ def setup_generation_config(args, model, assistant_model, tokenizer): generation_config.trim_logits = args.trim_logits generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.clear_hpu_graphs_cache = args.clear_hpu_graphs_cache generation_config.reuse_cache = args.reuse_cache generation_config.reduce_recompile = args.reduce_recompile if generation_config.reduce_recompile: @@ -742,47 +749,3 @@ def initialize_model(args, logger): logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") logger.info(f"Model initialization took {(init_end - init_start):.3f}s") return model, assistant_model, tokenizer, generation_config - - -# TODO:This will be removed from Synapse v1.19 release. -# This is to override _load_remaining_pretrained_weight for Transformer 4.45 release. 
-def local_load_remaining_pretrained_weight(self, model): - from transformers.modeling_utils import _load_state_dict_into_meta_model, load_state_dict - - resolved_archive_file = self.kwargs.pop("resolved_archive_file", None) - torch_dtype = self.kwargs.pop("torch_dtype", torch.float32) - dtype_orig = self.kwargs.pop("dtype_orig", None) - offload_folder = self.kwargs.pop("offload_folder", None) - offload_state_dict = self.kwargs.pop("offload_state_dict", False) - - # restore default dtype - if dtype_orig is not None: - torch.set_default_dtype(dtype_orig) - - if not isinstance(resolved_archive_file, list): - resolved_archive_file = [resolved_archive_file] - for shard_file in resolved_archive_file: - state_dict = load_state_dict(shard_file) - - params_dict = { - "model": model, - "state_dict": state_dict, - "start_prefix": "", - "expected_keys": self.loaded_state_dict_keys, - "device_map": {"": self.device}, - "offload_folder": offload_folder, - "state_dict_folder": tempfile.mkdtemp() if offload_state_dict else None, - "state_dict_index": {} if offload_state_dict else None, - "dtype": torch_dtype, - "keep_in_fp32_modules": [], - } - - _load_state_dict_into_meta_model(**params_dict) - - # make sure token embedding weights are still tied if needed - model.tie_weights() - - # Set model in evaluation mode to deactivate DropOut modules by default - model.eval() - - return model diff --git a/examples/text-to-speech/README.md b/examples/text-to-speech/README.md index a1e089f55e..21070d275f 100644 --- a/examples/text-to-speech/README.md +++ b/examples/text-to-speech/README.md @@ -33,8 +33,4 @@ python3 run_pipeline.py \ --text "Hello, my dog is cooler than you!" \ --use_hpu_graphs \ --bf16 -``` -Models that have been validated: - - [microsoft/speecht5_tts](https://huggingface.co/microsoft/speecht5_tts) - - [facebook/hf-seamless-m4t-medium](https://huggingface.co/facebook/hf-seamless-m4t-medium) - - [facebook/mms-tts-eng](https://huggingface.co/facebook/mms-tts-eng) +``` \ No newline at end of file diff --git a/examples/text-to-speech/run_pipeline.py b/examples/text-to-speech/run_pipeline.py index 1d9b53de7d..81546b0cb9 100644 --- a/examples/text-to-speech/run_pipeline.py +++ b/examples/text-to-speech/run_pipeline.py @@ -129,7 +129,7 @@ def main(): text, batch_size=args.batch_size, forward_params=forward_params, generate_kwargs=generate_kwargs ) end = time.time() - logger.info(f"speech = {speech} time = {(end-start) * 1000 / args.n_iterations }ms") + logger.info(f"speech = {speech} time = {(end - start) * 1000 / args.n_iterations}ms") sf.write("speech.wav", speech[0]["audio"].squeeze(), samplerate=speech[0]["sampling_rate"]) diff --git a/examples/text-to-video/requirements.txt b/examples/text-to-video/requirements.txt deleted file mode 100644 index 6ab6d0d570..0000000000 --- a/examples/text-to-video/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opencv-python-headless diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index c2def132a7..6f55ae1350 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.45.0") -check_optimum_habana_min_version("1.14.0.dev0") +check_optimum_habana_min_version("1.16.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/trl/README.md b/examples/trl/README.md index 750fc82b08..5e488e7072 100644 --- a/examples/trl/README.md +++ b/examples/trl/README.md @@ -39,14 +39,15 @@ $ pip install -U -r requirements.txt --lora_dropout=0.05 \ --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \ --max_seq_length 512 \ - --adam_epsilon 1e-08 + --adam_epsilon 1e-08 \ + --use_flash_attention ``` -2. Supervised fine-tuning of the mistralai/Mixtral-8x7B-v0.1 on 4 cards: +2. Supervised fine-tuning of the mistralai/Mixtral-8x7B-Instruct-v0.1 on 4 cards: ``` DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 4 --use_deepspeed sft.py \ - --model_name_or_path mistralai/Mixtral-8x7B-v0.1 \ + --model_name_or_path mistralai/Mixtral-8x7B-Instruct-v0.1 \ --dataset_name "philschmid/dolly-15k-oai-style" \ --subset 'data/' \ --streaming False \ @@ -78,103 +79,10 @@ $ pip install -U -r requirements.txt ### Training -#### For meta-llama/Llama-2-7b-hf - -The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model. -There are two main steps to the DPO training process: -1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: - - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --dataset_name "lvwerra/stack-exchange-paired" \ - --output_dir="./sft" \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --do_train \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_llama2" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` - -2. Run the DPO trainer using the model saved by the previous step: - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi dpo.py \ - --model_name_or_path="sft/final_merged_checkpoint" \ - --tokenizer_name_or_path=meta-llama/Llama-2-7b-hf \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "out_proj" "fc_in" "fc_out" "wte" \ - --output_dir="dpo" \ - --report_to=none - ``` - -#### mistralai/Mistral-7B-v0.1 - -1. 
Supervised fine-tuning of the base Mistral-7B-v0.1 model: - - ``` - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py \ - --model_name_or_path mistralai/Mistral-7B-v0.1 \ - --dataset_name "lvwerra/stack-exchange-paired" \ - --deepspeed ../language-modeling/llama2_ds_zero3_config.json \ - --output_dir="./sft" \ - --do_train \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --per_device_train_batch_size=1 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_mistral" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - - ``` - python merge_peft_adapter.py --base_model_name="mistralai/Mistral-7B-v0.1" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` - -2. Run the DPO trainer using the model saved by the previous step: - ``` - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed dpo.py \ - --model_name_or_path="sft/final_merged_checkpoint" \ - --tokenizer_name_or_path=mistralai/Mistral-7B-v0.1 \ - --deepspeed ../language-modeling/llama2_ds_zero3_config.json \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "out_proj" "fc_in" "fc_out" "wte" \ - --output_dir="dpo" \ - --max_prompt_length=256 \ - --max_length=512 \ - --report_to=none - ``` - #### For meta-llama/Llama-2-70b-hf +The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-70b model. There are two main steps to the DPO training process. + For large model like Llama2-70B, we could use DeepSpeed Zero-3 to enable DPO training in multi-card. steps like: 1. Supervised fine-tuning of the base llama-v2-70b model to create llama-v2-70b-se: @@ -359,8 +267,12 @@ python ddpo.py \ --use_hpu_graphs \ --bf16 \ --hf_hub_model_id="ddpo-finetuned-stable-diffusion" \ - --push_to_hub False + --push_to_hub False \ + --sdp_on_bf16 ``` +> [!NOTE] +> Due to a known issue on Gaudi3, sample_batch_sizes should be changed to 3. The issue will be fixed in the future release. + 2. 
Inference using the fine-tuned LoRA weights as shown in the example below: ```python diff --git a/examples/trl/ddpo.py b/examples/trl/ddpo.py index 46caf64c49..a2f1f15733 100644 --- a/examples/trl/ddpo.py +++ b/examples/trl/ddpo.py @@ -79,6 +79,9 @@ class ScriptArguments: push_to_hub: bool = field(default=False, metadata={"help": "Whether or not to push the model to the Hub."}) use_habana: bool = field(default=True, metadata={"help": "Whether or not to use HPU."}) use_hpu_graphs: bool = field(default=True, metadata={"help": "Whether or not to use hpu graphs."}) + sdp_on_bf16: bool = field( + default=False, metadata={"help": "Allow pyTorch to use reduced precision in the SDPA math backend."} + ) class MLP(nn.Module): @@ -225,6 +228,7 @@ def image_outputs_logger(image_data, global_step, accelerate_logger): use_habana=args.use_habana, use_hpu_graphs=args.use_hpu_graphs, gaudi_config=gaudi_config, + sdp_on_bf16=args.sdp_on_bf16, ) trainer = GaudiDDPOTrainer( diff --git a/examples/video-classification/run_example.py b/examples/video-classification/run_example.py index b593fb5955..2f78883742 100644 --- a/examples/video-classification/run_example.py +++ b/examples/video-classification/run_example.py @@ -80,7 +80,10 @@ def run( warm_up_epcohs: int, use_hpu_graphs: bool, cast_bf16: bool, + sdp_on_bf16: bool, ): + if sdp_on_bf16: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) processor = VideoMAEImageProcessor.from_pretrained(model_name) device = torch.device("hpu") model = VideoMAEForVideoClassification.from_pretrained(model_name) @@ -152,6 +155,9 @@ def main(): action="store_true", help="Whether to perform in bf16 precision.", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) parser.add_argument( "--log_level", default=None, @@ -176,6 +182,7 @@ def main(): args.warm_up_epochs, args.use_hpu_graphs, args.bf16, + args.sdp_on_bf16, ) diff --git a/examples/text-to-video/README.md b/examples/video-comprehension/README.md similarity index 55% rename from examples/text-to-video/README.md rename to examples/video-comprehension/README.md index 1df4e44e59..da54f26740 100644 --- a/examples/text-to-video/README.md +++ b/examples/video-comprehension/README.md @@ -1,12 +1,9 @@ -# Text to Video Examples - -This directory contains a script that showcases how to use the `GaudiTextToVideoSDPipeline` to run text-to-video generation tasks on HPUs. - -## Requirements +# Examples -First, you should install the requirements: - -```bash -pip install -r requirements.txt -``` +This directory contains example scripts that demonstrate how to perform video comprehension on Gaudi with graph mode. 
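At a high level, the `run_example.py` script added alongside this README follows the standard `transformers` Video-LLaVA flow, swapping in the Gaudi-adapted model class from this repository and sampling video frames with PyAV. The sketch below is illustrative only: the frame count, prompt, and generation settings are assumptions, and details such as bf16 casting and HPU graph wrapping are handled by the script's command-line flags.

```python
# Minimal sketch of the Video-LLaVA video-comprehension flow on Gaudi (illustrative, not the full script).
import av
import numpy as np
import habana_frameworks.torch.core  # noqa: F401  # registers the "hpu" device
from huggingface_hub import hf_hub_download
from transformers import VideoLlavaProcessor

from optimum.habana.transformers.modeling_utils import (
    GaudiVideoLlavaForConditionalGeneration,
    adapt_transformers_to_gaudi,
)

adapt_transformers_to_gaudi()  # patch transformers with Gaudi-optimized implementations

model_id = "LanguageBind/Video-LLaVA-7B-hf"
processor = VideoLlavaProcessor.from_pretrained(model_id)
model = GaudiVideoLlavaForConditionalGeneration.from_pretrained(model_id).to("hpu")

# Decode a short clip and keep a handful of evenly spaced frames, as the script's read_video_pyav helper does.
video_path = hf_hub_download(
    repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
container = av.open(video_path)
total_frames = container.streams.video[0].frames
indices = np.linspace(0, total_frames - 1, num=8, dtype=int)  # 8 frames is an arbitrary choice for this sketch
container.seek(0)
frames = [f.to_ndarray(format="rgb24") for i, f in enumerate(container.decode(video=0)) if i in indices]
clip = np.stack(frames)

prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
inputs = processor(text=prompt, videos=clip, return_tensors="pt").to("hpu")
output_ids = model.generate(**inputs, max_new_tokens=100)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```
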
## Single-HPU inference +### Video-LLaVA Model + ```bash -python3 text_to_video_generation.py \ - --model_name_or_path ali-vilab/text-to-video-ms-1.7b \ - --prompts "An astronaut riding a horse" \ - --use_habana \ +python3 run_example.py \ + --model_name_or_path "LanguageBind/Video-LLaVA-7B-hf" \ + --warmup 3 \ + --n_iterations 5 \ + --batch_size 1 \ --use_hpu_graphs \ - --dtype bf16 + --bf16 \ + --output_dir ./ ``` - Models that have been validated: - - [ali-vilab/text-to-video-ms-1.7b](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) + - [LanguageBind/Video-LLaVA-7B-hf ](https://huggingface.co/LanguageBind/Video-LLaVA-7B-hf) diff --git a/examples/video-comprehension/requirements.txt b/examples/video-comprehension/requirements.txt new file mode 100644 index 0000000000..7ed65352d9 --- /dev/null +++ b/examples/video-comprehension/requirements.txt @@ -0,0 +1,2 @@ +av == 12.1.0 +sentencepiece == 0.2.0 diff --git a/examples/video-comprehension/run_example.py b/examples/video-comprehension/run_example.py new file mode 100644 index 0000000000..5868bea3e8 --- /dev/null +++ b/examples/video-comprehension/run_example.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import json +import logging +import os +import time +from pathlib import Path + +import av +import numpy as np +import torch +from huggingface_hub import hf_hub_download +from transformers import VideoLlavaProcessor + +from optimum.habana.transformers.modeling_utils import ( + GaudiVideoLlavaForConditionalGeneration, + adapt_transformers_to_gaudi, +) + + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def read_video_pyav(container, indices): + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + help="Path to pre-trained model", + ) + parser.add_argument( + "--video_path", + default=None, + type=str, + nargs="*", + help='Path to video as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --video_path "URL1" "URL2")', + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")', + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. 
Using HPU graphs should give better latencies.", + ) + parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") + parser.add_argument( + "--ignore_eos", + action="store_true", + help="Whether to disable stopping with eos token when calling `generate`.", + ) + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + + args = parser.parse_args() + + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + + if args.video_path is None: + args.video_path = [ + hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset" + ) + ] + + if args.prompt is None: + args.prompt = ["USER: