From 537c2f0d926936d9d56d6957e6fd4e825edae37b Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 15:59:12 -0700 Subject: [PATCH 1/5] WIP --- ludwig/models/llm.py | 20 +++++++++++++++----- requirements.txt | 3 ++- requirements_extra.txt | 2 +- tests/integration_tests/test_llm.py | 7 ++++++- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py index a0d3c971a60..00a5156939a 100644 --- a/ludwig/models/llm.py +++ b/ludwig/models/llm.py @@ -366,12 +366,22 @@ def forward( input_ids, target_ids, self.tokenizer, self.global_max_sequence_length ) + # TODO: ALEX # Wrap with flash attention backend for faster generation - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( - torch.cuda.is_available() and self.curr_device.type == "cuda" - ) else contextlib.nullcontext(): - # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass - model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( + # torch.cuda.is_available() and self.curr_device.type == "cuda" + # ) else contextlib.nullcontext(): + # # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass + # model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + # TODO: ALEX + # TODO: ALEX + if torch.cuda.is_available(): + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + else: + with contextlib.nullcontext(): + model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + # TODO: ALEX if self.output_feature_type != TEXT: # Pass generated tokens through decoder after averaging the token probabilities diff --git a/requirements.txt b/requirements.txt index a6df2690e88..840cc0c3463 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,8 @@ torchaudio torchtext torchvision pydantic<2.0 -transformers>=4.33.2 +#transformers>=4.33.2 +transformers>=4.33.2,<4.35.0 tokenizers>=0.13.3 spacy>=2.3 PyYAML>=3.12,<6.0.1,!=5.4.* #Exlude PyYAML 5.4.* due to incompatibility with awscli diff --git a/requirements_extra.txt b/requirements_extra.txt index 26fe48eb998..f56ff5070a1 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,5 +1,5 @@ # requirements for horovod -horovod[pytorch]>=0.24.0,!=0.26.0 +#horovod[pytorch]>=0.24.0,!=0.26.0 # alternative to Dask modin[ray] diff --git a/tests/integration_tests/test_llm.py b/tests/integration_tests/test_llm.py index 6962a7ef9ce..8f278c19ef0 100644 --- a/tests/integration_tests/test_llm.py +++ b/tests/integration_tests/test_llm.py @@ -459,6 +459,7 @@ def _verify_lm_lora_finetuning_layers( @pytest.mark.parametrize( "finetune_strategy,adapter_args", [ + # TODO: ALEX pytest.param( None, {}, @@ -504,16 +505,20 @@ def _verify_lm_lora_finetuning_layers( {POSTPROCESSOR: {MERGE_ADAPTER_INTO_BASE_MODEL: False}}, id="adalora_not_merged", ), + # TODO: ALEX + # TODO: ALEX pytest.param( "adaption_prompt", {}, id="adaption_prompt-defaults", ), + # TODO: ALEX pytest.param( "adaption_prompt", {"adapter_len": 6, "adapter_layers": 1}, id="adaption_prompt-modified-defaults", ), + # TODO: ALEX # 
pytest.param( # "prompt_tuning", # { @@ -761,7 +766,7 @@ def test_llm_lora_finetuning_merge_and_unload_4_bit_quantization_not_supported(l "config.json", "generation_config.json", "merges.txt", - "pytorch_model.bin", + "model.safetensors", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", From fe4f5f1c677e27a860da9515391544759df0df15 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 16:05:18 -0700 Subject: [PATCH 2/5] Pinning Transformers to not include version 4.35.0 because it breaks a method in PEFT. --- ludwig/models/llm.py | 20 +++++--------------- requirements.txt | 3 +-- requirements_extra.txt | 2 +- tests/integration_tests/test_llm.py | 5 ----- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py index 00a5156939a..a0d3c971a60 100644 --- a/ludwig/models/llm.py +++ b/ludwig/models/llm.py @@ -366,22 +366,12 @@ def forward( input_ids, target_ids, self.tokenizer, self.global_max_sequence_length ) - # TODO: ALEX # Wrap with flash attention backend for faster generation - # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( - # torch.cuda.is_available() and self.curr_device.type == "cuda" - # ) else contextlib.nullcontext(): - # # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass - # model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) - # TODO: ALEX - # TODO: ALEX - if torch.cuda.is_available(): - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): - model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) - else: - with contextlib.nullcontext(): - model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) - # TODO: ALEX + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( + torch.cuda.is_available() and self.curr_device.type == "cuda" + ) else contextlib.nullcontext(): + # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass + model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) if self.output_feature_type != TEXT: # Pass generated tokens through decoder after averaging the token probabilities diff --git a/requirements.txt b/requirements.txt index 840cc0c3463..edeff450f6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,7 @@ torchaudio torchtext torchvision pydantic<2.0 -#transformers>=4.33.2 -transformers>=4.33.2,<4.35.0 +transformers>=4.33.2,<4.35.0 # pinning since version 4.35.0 on 11/2/2023 causes IndexError in PEFT tokenizers>=0.13.3 spacy>=2.3 PyYAML>=3.12,<6.0.1,!=5.4.* #Exlude PyYAML 5.4.* due to incompatibility with awscli diff --git a/requirements_extra.txt b/requirements_extra.txt index f56ff5070a1..26fe48eb998 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,5 +1,5 @@ # requirements for horovod -#horovod[pytorch]>=0.24.0,!=0.26.0 +horovod[pytorch]>=0.24.0,!=0.26.0 # alternative to Dask modin[ray] diff --git a/tests/integration_tests/test_llm.py b/tests/integration_tests/test_llm.py index 8f278c19ef0..cd0430cc41e 100644 --- a/tests/integration_tests/test_llm.py +++ b/tests/integration_tests/test_llm.py @@ -459,7 +459,6 @@ def _verify_lm_lora_finetuning_layers( @pytest.mark.parametrize( 
"finetune_strategy,adapter_args", [ - # TODO: ALEX pytest.param( None, {}, @@ -505,20 +504,16 @@ def _verify_lm_lora_finetuning_layers( {POSTPROCESSOR: {MERGE_ADAPTER_INTO_BASE_MODEL: False}}, id="adalora_not_merged", ), - # TODO: ALEX - # TODO: ALEX pytest.param( "adaption_prompt", {}, id="adaption_prompt-defaults", ), - # TODO: ALEX pytest.param( "adaption_prompt", {"adapter_len": 6, "adapter_layers": 1}, id="adaption_prompt-modified-defaults", ), - # TODO: ALEX # pytest.param( # "prompt_tuning", # { From 2f562ae76b3cf1d79cbf7fb036105762619de3a0 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 16:09:57 -0700 Subject: [PATCH 3/5] Revert to "pytorch_model.bin", for Transformers version 4.34.1 -- it will change to "model.safetensors" for versions 4.35.0 and greater. --- tests/integration_tests/test_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_llm.py b/tests/integration_tests/test_llm.py index cd0430cc41e..2005e3448d0 100644 --- a/tests/integration_tests/test_llm.py +++ b/tests/integration_tests/test_llm.py @@ -761,7 +761,7 @@ def test_llm_lora_finetuning_merge_and_unload_4_bit_quantization_not_supported(l "config.json", "generation_config.json", "merges.txt", - "model.safetensors", + "pytorch_model.bin", # If Transformers >4.34.1 is installed with PEFT 0.6.0, use "model.safetensors". "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", From 7a4ced197d8797293d4fb655f44df600049d849b Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 16:12:09 -0700 Subject: [PATCH 4/5] Revert to "pytorch_model.bin", for Transformers version 4.34.1 -- it will change to "model.safetensors" for versions 4.35.0 and greater. --- requirements_llm.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requirements_llm.txt b/requirements_llm.txt index 8ae6f0299dd..f120b04ac98 100644 --- a/requirements_llm.txt +++ b/requirements_llm.txt @@ -4,6 +4,4 @@ faiss-cpu accelerate loralib -# Temporarily pin PEFT to PEFT master for Mistral-7b support: -# https://github.com/ludwig-ai/ludwig/issues/3724 -peft @ git+https://github.com/huggingface/peft.git@07f2b82 +peft From 8daf73fcc8384596449c1f0dc7cd7d10ec949466 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 17:52:04 -0700 Subject: [PATCH 5/5] Empty commit (in order for Azure CI checks to rerun).