[core / tests] v1 slow tests #1218
@@ -2,15 +2,11 @@ name: Slow tests (on push)

 on:
   push:
-    branches: [ main ]
-    paths:
-      # Run only when python files are modified
-      - "trl/**.py"
-      - "examples/**.py"
+    branches: [ add-slow-tests ]
 env:
   RUN_SLOW: "yes"
   IS_GITHUB_CI: "1"
-  SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
+  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

 jobs:
@@ -34,18 +30,20 @@ jobs:
       - name: Pip install
         run: |
           source activate trl
-          pip install -e . --no-deps
-          pip install pytest-reportlog
+          pip install -e ".[test]" --no-deps
+          pip install pytest-reportlog parameterized

-      - name: Run common tests on single GPU
+      - name: Run slow SFT tests on single GPU
         if: always()
         run: |
           source activate trl
-          make tests_common_gpu
+          make slow_sft_tests

-      - name: Run slow tests on single GPU
+      - name: Run slow DPO tests on single GPU
         if: always()
         run: |
           source activate trl
-          make slow_tests_single_gpu
+          make slow_dpo_tests
Review comment: Thinking about it more, what do you think about renaming/structuring the tests? At the moment they are a bit intertwined, which I find a bit confusing.

Reply: I agree, let me push a few things.

Reply: I modified the make command to have a more global command that tests DPO + SFT. In the future, if we want to test, say, KTO, we just need to extend that command.
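For readers following along, here is a rough sketch of what such a consolidated make recipe could run. The target name, test-file paths, and log names are assumptions for illustration; the Makefile itself is not part of the visible diff.

```bash
# Hypothetical consolidated "slow tests" recipe: run each trainer's slow suite in turn,
# writing one pytest-reportlog file per suite. Extending it to e.g. KTO would just mean
# appending one more pytest invocation.
source activate trl
pytest tests/slow/test_sft_slow.py -s -v --report-log "sft_slow.log"
pytest tests/slow/test_dpo_slow.py -s -v --report-log "dpo_slow.log"
```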
       - name: Generate Report
         if: always()
@@ -74,24 +72,35 @@ jobs:
       - name: Pip install
         run: |
           source activate trl
-          pip install -e . --no-deps
-          pip install pytest-reportlog

-      - name: Run common tests on single GPU
Review comment: I propose to support common tests in a follow-up PR; right now many of them fail because of device-mismatch problems. I think it is OK to not have them for now, as these tests are run anyway on the CI.

Reply: Sounds good, that would indeed be a nice addition, since we had issues in the past with device mismatch which this would have caught.
+          pip install -e ".[test]" --no-deps
+          pip install pytest-reportlog parameterized

+      - name: Run slow SFT tests on single GPU
         if: always()
         run: |
           source activate trl
-          make tests_common_gpu
+          make slow_sft_tests

-      - name: Run slow tests on multi GPU
+      - name: Run slow DPO tests on single GPU
         if: always()
         run: |
           source activate trl
-          make slow_tests_multi_gpu
+          make slow_dpo_tests

+      - name: Run end-to-end SFT examples tests on multi GPU
+        if: always()
+        run: |
+          source activate trl
+          pip install deepspeed
+          make run_sft_examples

+      - name: Run end-to-end DPO examples tests on multi GPU
+        if: always()
+        run: |
+          source activate trl
+          pip install deepspeed
+          make run_dpo_examples

       - name: Generate Reports
         if: always()
         run: |
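The new example steps above only call `make run_sft_examples` and `make run_dpo_examples`; the actual work happens in the shell scripts added below, which read an optional `TRL_ACCELERATE_CONFIG` environment variable. A minimal sketch of how such a recipe could drive them follows; the script path and config file names are assumptions, not shown in this diff.

```bash
# Hypothetical recipe body: run the SFT example once per accelerate config.
# An empty TRL_ACCELERATE_CONFIG makes the script fall back to its default QLoRA + PEFT path.
for cfg in "" "examples/accelerate_configs/multi_gpu.yaml" "examples/accelerate_configs/deepspeed_zero1.yaml"; do
    TRL_ACCELERATE_CONFIG="$cfg" bash commands/run_sft.sh || exit 1
done
```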
@@ -0,0 +1,56 @@
#!/bin/bash
# This script runs a DPO example end-to-end on a tiny model using different possible configurations
# but defaults to QLoRA + PEFT
OUTPUT_DIR="test_dpo/"
MODEL_NAME="HuggingFaceM4/tiny-random-LlamaForCausalLM"
MAX_STEPS=5
BATCH_SIZE=2
SEQ_LEN=128

# Handle extra arguments in case one passes accelerate configs.
EXTRA_ACCELERATE_ARGS=""
EXTRA_TRAINING_ARGS="""--use_peft \
    --load_in_4bit
"""

# This is a hack to get the number of available GPUs
mapfile -t num_gpus < <(nvidia-smi --format=csv --query-gpu=index | tail -n+2 | wc -l)
NUM_GPUS=${num_gpus[0]}

if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then
  EXTRA_ACCELERATE_ARGS=""
else
  EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG"
  # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed
  # on `examples/accelerate_configs` and our runners do not support bf16 mixed precision training.
  if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then
    EXTRA_TRAINING_ARGS="--fp16"
  else
    echo "Keeping QLoRA + PEFT"
  fi
fi

CMD="""
accelerate launch $EXTRA_ACCELERATE_ARGS \
    --num_processes $NUM_GPUS \
    `pwd`/examples/scripts/dpo.py \
    --model_name_or_path $MODEL_NAME \
    --output_dir $OUTPUT_DIR \
    --max_steps $MAX_STEPS \
    --per_device_train_batch_size $BATCH_SIZE \
    --max_length $SEQ_LEN \
    $EXTRA_TRAINING_ARGS
"""

echo "Starting program..."

{ # try
    echo $CMD
    eval "$CMD"
} || { # catch
    # save log for exception
    echo "Operation Failed!"
    exit 1
}
exit 0
Review comment on lines +48 to +56: The bash script will be run on the assumption that …
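For reference, here is a hypothetical direct invocation of the script above; the script path and config path are assumptions, since file names are not visible in this diff.

```bash
# Default run: QLoRA + PEFT, using however many GPUs nvidia-smi reports.
bash commands/run_dpo.sh

# With a DeepSpeed accelerate config, the script swaps the PEFT/4-bit flags for --fp16.
TRL_ACCELERATE_CONFIG="examples/accelerate_configs/deepspeed_zero1.yaml" bash commands/run_dpo.sh
```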
@@ -0,0 +1,58 @@
#!/bin/bash
# This script runs an SFT example end-to-end on a tiny model using different possible configurations
# but defaults to QLoRA + PEFT
OUTPUT_DIR="test_sft/"
MODEL_NAME="HuggingFaceM4/tiny-random-LlamaForCausalLM"
DATASET_NAME="imdb"
MAX_STEPS=5
BATCH_SIZE=2
SEQ_LEN=128

# Handle extra arguments in case one passes accelerate configs.
EXTRA_ACCELERATE_ARGS=""
EXTRA_TRAINING_ARGS="""--use_peft \
    --load_in_4bit
"""

# This is a hack to get the number of available GPUs
mapfile -t num_gpus < <(nvidia-smi --format=csv --query-gpu=index | tail -n+2 | wc -l)
NUM_GPUS=${num_gpus[0]}

if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then
  EXTRA_ACCELERATE_ARGS=""
else
  EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG"
  # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed
  # on `examples/accelerate_configs` and our runners do not support bf16 mixed precision training.
  if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then
    EXTRA_TRAINING_ARGS="--fp16"
  else
    echo "Keeping QLoRA + PEFT"
  fi
fi

CMD="""
accelerate launch $EXTRA_ACCELERATE_ARGS \
    --num_processes $NUM_GPUS \
    `pwd`/examples/scripts/sft.py \
    --model_name $MODEL_NAME \
    --dataset_name $DATASET_NAME \
    --output_dir $OUTPUT_DIR \
    --max_steps $MAX_STEPS \
    --batch_size $BATCH_SIZE \
    --seq_length $SEQ_LEN \
    $EXTRA_TRAINING_ARGS
"""

echo "Starting program..."

{ # try
    echo $CMD
    eval "$CMD"
} || { # catch
    # save log for exception
    echo "Operation Failed!"
    exit 1
}
exit 0
@@ -55,7 +55,7 @@ RUN source activate trl && \
     transformers \
     accelerate \
     peft \
-    trl
+    trl[test]@git+https://github.com/huggingface/trl
Review comment: It actually makes sense to always build TRL from source on our Docker images.
 RUN source activate trl && \
     pip freeze | grep trl
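Outside the Docker build, the equivalent install of the new dependency spec would be the standard pip direct-URL form (shown here only as a usage note, not something added by this PR):

```bash
# Install TRL from the main branch together with its [test] extras,
# matching the spec used in the Dockerfile above.
pip install "trl[test] @ git+https://github.com/huggingface/trl"
```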
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: "NO"
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'bf16'
Review comment (on `mixed_precision: 'bf16'`): shouldn't that also be …?

Reply: After thinking a bit, I think we can keep everything bf16; let me push something.
num_machines: 1
num_processes: 8
Review comment: isn't that 8 GPUs?

Reply: Yes, but it gets overwritten in the shell script to keep the config file untouched.
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
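As the exchange above notes, `num_processes: 8` is not what actually runs: the launcher scripts pass `--num_processes $NUM_GPUS` on the command line, and `accelerate launch` gives CLI flags precedence over the config file. A minimal illustration, with flag values borrowed from the scripts above:

```bash
# The config file asks for 8 processes, but the explicit CLI flag wins,
# so only as many processes as detected GPUs are launched.
accelerate launch --config_file "$TRL_ACCELERATE_CONFIG" \
    --num_processes "$NUM_GPUS" \
    examples/scripts/sft.py --model_name "$MODEL_NAME" --max_steps 5
```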
Review comment: I adapted the DPO script to make sure it supports QLoRA; I feel this feature is quite under-rated today and should be publicized much more.

Review comment: To modify after final pass.