
Commit

Merge branch 'master' into tohtana/pp_dynamic_shape
tohtana authored Aug 15, 2024
2 parents 7fe47d0 + 4ba49dd commit 33453bf
Showing 28 changed files with 849 additions and 227 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/hpu-gaudi2.yml
@@ -39,13 +39,14 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

env:
PT_HPU_LAZY_MODE: 0
TORCHINDUCTOR_COMPILE_THREADS: 1
TEST_LIST: |
test_accelerator.py
test_autotuning.py
@@ -103,7 +104,7 @@ jobs:
- name: Check container state
run: |
ldd --version
hl-smi
hl-smi -L
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -128,7 +129,7 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
echo "TEST_LIST ${TEST_LIST}"
echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}"
pytest --verbose unit/ -k "${TEST_LIST}"
36 changes: 18 additions & 18 deletions .github/workflows/xpu-max1100.yml
@@ -36,47 +36,47 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/intel-extension-for-pytorch:2.1.30-xpu
image: intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL

steps:
- uses: actions/checkout@v4
- name: Check container state
shell: bash
run: |
ldd --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
- name: Install deepspeed
- name: Install prerequisite
run: |
pip install py-cpuinfo
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
pip install torch==2.1.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install intel-extension-for-pytorch==2.1.30+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
pip install intel-extension-for-pytorch-deepspeed==2.1.30 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install oneccl_bind_pt==2.1.300+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
pip install torchvision==0.16.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install py-cpuinfo numpy==1.26
pip install .[dev,autotuning]
ds_report
python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
- name: Python environment
- name: Check container state
run: |
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list
- name: Unit tests
run: |
pip install pytest pytest-timeout tabulate tensorboard wandb
export ONEAPI_ROOT=/opt/intel/oneapi/redist
export FI_PROVIDER_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib/prov
export LD_LIBRARY_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$ONEAPI_ROOT/lib:$LD_LIBRARY_PATH
cd tests/unit
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py
pytest --verbose checkpoint/test_moe_checkpoint.py
pytest --verbose checkpoint/test_shared_weights.py
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
pytest --verbose model_parallelism/*
pytest --verbose moe/test_moe_tp.py
pytest --verbose monitor/*
pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
6 changes: 5 additions & 1 deletion accelerator/hpu_accelerator.py
@@ -298,7 +298,11 @@ def export_envs(self):
return []

def visible_devices_envs(self):
return ['HABANA_VISIBLE_MODULES']
# The way DeepSpeed currently sets this env var is not applicable to all HPU instances.
# Users have to follow the instructions in:
# https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html
# Keeping CUDA_VISIBLE_DEVICES for now.
return ['CUDA_VISIBLE_DEVICES'] #['HABANA_VISIBLE_MODULES']

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
for env in self.visible_devices_envs():
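For context, a minimal sketch (not part of the diff) of how these two hooks are typically combined by a launcher to pin a rank to specific devices; the join-to-comma-separated-string behavior of set_visible_devices_envs is an assumption, since its body is collapsed in this hunk.

# Hypothetical, self-contained sketch of the two accelerator hooks above.
# The comma-joining behavior of set_visible_devices_envs is assumed here.
current_env = {}
local_accelerator_ids = [0, 1]
for env in ['CUDA_VISIBLE_DEVICES']:  # value returned by visible_devices_envs() after this change
    current_env[env] = ",".join(map(str, local_accelerator_ids))
print(current_env)  # {'CUDA_VISIBLE_DEVICES': '0,1'}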
1 change: 1 addition & 0 deletions deepspeed/linear/__init__.py
@@ -5,3 +5,4 @@

from .optimized_linear import OptimizedLinear
from .config import LoRAConfig, QuantizationConfig
from .context_manager import Init, init_lora
12 changes: 11 additions & 1 deletion deepspeed/linear/config.py
@@ -3,7 +3,8 @@

# DeepSpeed Team

from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List


@dataclass
@@ -17,10 +18,19 @@ class LoRAConfig:
base_weight_sharding (int): The degree to which the base weights are sharded,
should typically be set to the data-parallel world size to maximize the memory
reduction benefits. Defaults to 1, which means this feature is disabled.
offload (bool): Whether to offload frozen parameters to CPU when not in use.
offload_ratio (float): Ratio of frozen parameters to offload to CPU when not in use.
delay_lora_init (bool): Whether to defer LoRA parameter initialization from model init to a later manual call (via init_lora).
target_mods (List[str]): Target module names to apply LoRA to; defaults to the Llama 3.1 architecture's projection layers.
"""
lora_r: int = 64
lora_alpha: float = 16.
base_weight_sharding: int = 1
offload: bool = False
offload_ratio: float = 0.0
delay_lora_init: bool = False
target_mods: List[str] = field(
default_factory=lambda: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'])


@dataclass
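To illustrate the new fields, a minimal construction sketch follows (not part of the diff); the values are arbitrary illustrations rather than recommendations, and the defaults remain as defined in the dataclass above.

from deepspeed.linear import LoRAConfig

# Illustrative values only; defaults are defined in the dataclass above.
lora_config = LoRAConfig(
    lora_r=64,
    lora_alpha=16.0,
    base_weight_sharding=8,  # typically the data-parallel world size
    offload=True,            # offload frozen base parameters to CPU when not in use
    offload_ratio=0.5,       # fraction of frozen parameters to offload
    delay_lora_init=True,    # defer LoRA weight init until init_lora() is called later
    target_mods=['q_proj', 'k_proj', 'v_proj', 'o_proj'],  # subset of the default targets
)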
90 changes: 90 additions & 0 deletions deepspeed/linear/context_manager.py
@@ -0,0 +1,90 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .optimized_linear import LoRAOptimizedLinear, OptimizedLinear

import torch

try:
import transformers
except ImportError:
transformers = None


def init_lora(model):
model.requires_grad_(False)
for m in model.modules():
if isinstance(m, LoRAOptimizedLinear):
m.init_lora()


class Init(object):
"""
Init context wrapper, similar in style to zero.Init, that injects OptimizedLinear during model
construction to shard base weights and reduce overall memory usage during model init. Primarily
useful when initializing a model via transformers.AutoModelForCausalLM.
Example usage:
lora_config = deepspeed.linear.LoRAConfig(..)
quant_config = deepspeed.linear.QuantizationConfig(..)
with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config):
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")
"""

def __init__(self, lora_config=None, quant_config=None):
self._orig_nn_linear = torch.nn.Linear
self._orig_causallm_pretrained = None
if transformers is not None:
self._orig_causallm_pretrained = transformers.AutoModelForCausalLM.from_pretrained
self._orig_causallm_config = transformers.AutoModelForCausalLM.from_config
self.lora_config = lora_config
self.quant_config = quant_config
self._post_init_complete = False

def __enter__(self):

class OptLinearWrapper:
_orig_nn_linear = self._orig_nn_linear
_lora_config = self.lora_config
_quant_config = self.quant_config

def __new__(self, *args, **kwargs):
self._lora_config.delay_lora_init = True
kwargs['lora_config'] = self._lora_config
kwargs['quantization_config'] = self._quant_config
kwargs['linear_cls'] = self._orig_nn_linear
return OptimizedLinear(*args, **kwargs)

def _model_init(model):
if self.lora_config is not None:
init_lora(model)
self._post_init_complete = True
return model

# ensures non-lora params are frozen and lora weights are initialized
def from_pretrained(*args, **kwargs):
model = self._orig_causallm_pretrained(*args, **kwargs)
return _model_init(model)

def from_config(*args, **kwargs):
model = self._orig_causallm_config(*args, **kwargs)
return _model_init(model)

torch.nn.Linear = OptLinearWrapper
if transformers is not None:
transformers.AutoModelForCausalLM.from_pretrained = from_pretrained
transformers.AutoModelForCausalLM.from_config = from_config

def __exit__(self, *args, **kwargs):
torch.nn.Linear = self._orig_nn_linear
if transformers is not None:
transformers.AutoModelForCausalLM.from_pretrained = self._orig_causallm_pretrained
transformers.AutoModelForCausalLM.from_config = self._orig_causallm_config
if not self._post_init_complete:
print('WARNING: LoRA modules were not initialized. This is usually done automatically '
'when the model is created via transformers (AutoModelForCausalLM.from_pretrained/from_config). '
'You must call `init_lora` on the model in order to use DeepSpeed LoRA, otherwise '
'you will error out at runtime.')
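A hedged usage sketch of the new context manager follows, mirroring the docstring example above; the checkpoint name comes from that docstring, the config values are illustrative, and running it assumes transformers is installed and hardware capable of loading the model.

import deepspeed.linear as dsl
from transformers import AutoModelForCausalLM

# Config values are illustrative; see LoRAConfig/QuantizationConfig for the defaults.
lora_config = dsl.LoRAConfig(lora_r=64, base_weight_sharding=8)
quant_config = dsl.QuantizationConfig()

with dsl.Init(lora_config=lora_config, quant_config=quant_config):
    # from_pretrained is wrapped inside the context, so init_lora(model) runs automatically.
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")

# If the model were built inside the context without from_pretrained/from_config,
# the LoRA weights would need manual initialization instead:
# dsl.init_lora(model)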