From 537c2f0d926936d9d56d6957e6fd4e825edae37b Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 15:59:12 -0700 Subject: [PATCH 1/5] WIP --- ludwig/models/llm.py | 20 +++++++++++++++----- requirements.txt | 3 ++- requirements_extra.txt | 2 +- tests/integration_tests/test_llm.py | 7 ++++++- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py index a0d3c971a60..00a5156939a 100644 --- a/ludwig/models/llm.py +++ b/ludwig/models/llm.py @@ -366,12 +366,22 @@ def forward( input_ids, target_ids, self.tokenizer, self.global_max_sequence_length ) + # TODO: ALEX # Wrap with flash attention backend for faster generation - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( - torch.cuda.is_available() and self.curr_device.type == "cuda" - ) else contextlib.nullcontext(): - # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass - model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( + # torch.cuda.is_available() and self.curr_device.type == "cuda" + # ) else contextlib.nullcontext(): + # # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass + # model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + # TODO: ALEX + # TODO: ALEX + if torch.cuda.is_available(): + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + else: + with contextlib.nullcontext(): + model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) + # TODO: ALEX if self.output_feature_type != TEXT: # Pass generated tokens through decoder after averaging the token probabilities diff --git a/requirements.txt b/requirements.txt index a6df2690e88..840cc0c3463 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,8 @@ torchaudio torchtext torchvision pydantic<2.0 -transformers>=4.33.2 +#transformers>=4.33.2 +transformers>=4.33.2,<4.35.0 tokenizers>=0.13.3 spacy>=2.3 PyYAML>=3.12,<6.0.1,!=5.4.* #Exlude PyYAML 5.4.* due to incompatibility with awscli diff --git a/requirements_extra.txt b/requirements_extra.txt index 26fe48eb998..f56ff5070a1 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,5 +1,5 @@ # requirements for horovod -horovod[pytorch]>=0.24.0,!=0.26.0 +#horovod[pytorch]>=0.24.0,!=0.26.0 # alternative to Dask modin[ray] diff --git a/tests/integration_tests/test_llm.py b/tests/integration_tests/test_llm.py index 6962a7ef9ce..8f278c19ef0 100644 --- a/tests/integration_tests/test_llm.py +++ b/tests/integration_tests/test_llm.py @@ -459,6 +459,7 @@ def _verify_lm_lora_finetuning_layers( @pytest.mark.parametrize( "finetune_strategy,adapter_args", [ + # TODO: ALEX pytest.param( None, {}, @@ -504,16 +505,20 @@ def _verify_lm_lora_finetuning_layers( {POSTPROCESSOR: {MERGE_ADAPTER_INTO_BASE_MODEL: False}}, id="adalora_not_merged", ), + # TODO: ALEX + # TODO: ALEX pytest.param( "adaption_prompt", {}, id="adaption_prompt-defaults", ), + # TODO: ALEX pytest.param( "adaption_prompt", {"adapter_len": 6, "adapter_layers": 1}, id="adaption_prompt-modified-defaults", ), + # TODO: ALEX # 
pytest.param( # "prompt_tuning", # { @@ -761,7 +766,7 @@ def test_llm_lora_finetuning_merge_and_unload_4_bit_quantization_not_supported(l "config.json", "generation_config.json", "merges.txt", - "pytorch_model.bin", + "model.safetensors", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", From fe4f5f1c677e27a860da9515391544759df0df15 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 16:05:18 -0700 Subject: [PATCH 2/5] Pinning Transformers to not include version 4.35.0 because it breaks a method in PEFT. --- ludwig/models/llm.py | 20 +++++--------------- requirements.txt | 3 +-- requirements_extra.txt | 2 +- tests/integration_tests/test_llm.py | 5 ----- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py index 00a5156939a..a0d3c971a60 100644 --- a/ludwig/models/llm.py +++ b/ludwig/models/llm.py @@ -366,22 +366,12 @@ def forward( input_ids, target_ids, self.tokenizer, self.global_max_sequence_length ) - # TODO: ALEX # Wrap with flash attention backend for faster generation - # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( - # torch.cuda.is_available() and self.curr_device.type == "cuda" - # ) else contextlib.nullcontext(): - # # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass - # model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) - # TODO: ALEX - # TODO: ALEX - if torch.cuda.is_available(): - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): - model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) - else: - with contextlib.nullcontext(): - model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) - # TODO: ALEX + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if ( + torch.cuda.is_available() and self.curr_device.type == "cuda" + ) else contextlib.nullcontext(): + # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass + model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS) if self.output_feature_type != TEXT: # Pass generated tokens through decoder after averaging the token probabilities diff --git a/requirements.txt b/requirements.txt index 840cc0c3463..edeff450f6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,7 @@ torchaudio torchtext torchvision pydantic<2.0 -#transformers>=4.33.2 -transformers>=4.33.2,<4.35.0 +transformers>=4.33.2,<4.35.0 # pinning since version 4.35.0 on 11/2/2023 causes IndexError in PEFT tokenizers>=0.13.3 spacy>=2.3 PyYAML>=3.12,<6.0.1,!=5.4.* #Exlude PyYAML 5.4.* due to incompatibility with awscli diff --git a/requirements_extra.txt b/requirements_extra.txt index f56ff5070a1..26fe48eb998 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,5 +1,5 @@ # requirements for horovod -#horovod[pytorch]>=0.24.0,!=0.26.0 +horovod[pytorch]>=0.24.0,!=0.26.0 # alternative to Dask modin[ray] diff --git a/tests/integration_tests/test_llm.py b/tests/integration_tests/test_llm.py index 8f278c19ef0..cd0430cc41e 100644 --- a/tests/integration_tests/test_llm.py +++ b/tests/integration_tests/test_llm.py @@ -459,7 +459,6 @@ def _verify_lm_lora_finetuning_layers( @pytest.mark.parametrize( 
"finetune_strategy,adapter_args", [ - # TODO: ALEX pytest.param( None, {}, @@ -505,20 +504,16 @@ def _verify_lm_lora_finetuning_layers( {POSTPROCESSOR: {MERGE_ADAPTER_INTO_BASE_MODEL: False}}, id="adalora_not_merged", ), - # TODO: ALEX - # TODO: ALEX pytest.param( "adaption_prompt", {}, id="adaption_prompt-defaults", ), - # TODO: ALEX pytest.param( "adaption_prompt", {"adapter_len": 6, "adapter_layers": 1}, id="adaption_prompt-modified-defaults", ), - # TODO: ALEX # pytest.param( # "prompt_tuning", # { From 2f562ae76b3cf1d79cbf7fb036105762619de3a0 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 16:09:57 -0700 Subject: [PATCH 3/5] Revert to "pytorch_model.bin", for Transformers version 4.34.1 -- it will change to "model.safetensors" for versions 4.35.0 and greater. --- tests/integration_tests/test_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_llm.py b/tests/integration_tests/test_llm.py index cd0430cc41e..2005e3448d0 100644 --- a/tests/integration_tests/test_llm.py +++ b/tests/integration_tests/test_llm.py @@ -761,7 +761,7 @@ def test_llm_lora_finetuning_merge_and_unload_4_bit_quantization_not_supported(l "config.json", "generation_config.json", "merges.txt", - "model.safetensors", + "pytorch_model.bin", # If Transformers >4.34.1 is installed with PEFT 0.6.0, use "model.safetensors". "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", From 7a4ced197d8797293d4fb655f44df600049d849b Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 16:12:09 -0700 Subject: [PATCH 4/5] Revert to "pytorch_model.bin", for Transformers version 4.34.1 -- it will change to "model.safetensors" for versions 4.35.0 and greater. --- requirements_llm.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requirements_llm.txt b/requirements_llm.txt index 8ae6f0299dd..f120b04ac98 100644 --- a/requirements_llm.txt +++ b/requirements_llm.txt @@ -4,6 +4,4 @@ faiss-cpu accelerate loralib -# Temporarily pin PEFT to PEFT master for Mistral-7b support: -# https://github.com/ludwig-ai/ludwig/issues/3724 -peft @ git+https://github.com/huggingface/peft.git@07f2b82 +peft From 8daf73fcc8384596449c1f0dc7cd7d10ec949466 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 3 Nov 2023 17:52:04 -0700 Subject: [PATCH 5/5] Empty commit (in order for Azure CI checks to rerun).