
Commit

Merge branch 'master' into tohtana/pp_dynamic_shape
tohtana authored Aug 15, 2024
2 parents 7fe47d0 + 4ba49dd commit 33453bf
Showing 28 changed files with 849 additions and 227 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/hpu-gaudi2.yml
@@ -39,13 +39,14 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

env:
PT_HPU_LAZY_MODE: 0
TORCHINDUCTOR_COMPILE_THREADS: 1
TEST_LIST: |
test_accelerator.py
test_autotuning.py
@@ -103,7 +104,7 @@ jobs:
- name: Check container state
run: |
ldd --version
hl-smi
hl-smi -L
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -128,7 +129,7 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
echo "TEST_LIST ${TEST_LIST}"
echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}"
pytest --verbose unit/ -k "${TEST_LIST}"
36 changes: 18 additions & 18 deletions .github/workflows/xpu-max1100.yml
@@ -36,47 +36,47 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/intel-extension-for-pytorch:2.1.30-xpu
image: intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL

steps:
- uses: actions/checkout@v4
- name: Check container state
shell: bash
run: |
ldd --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
- name: Install deepspeed
- name: Install prerequisite
run: |
pip install py-cpuinfo
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
pip install torch==2.1.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install intel-extension-for-pytorch==2.1.30+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
pip install intel-extension-for-pytorch-deepspeed==2.1.30 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install oneccl_bind_pt==2.1.300+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
pip install torchvision==0.16.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
pip install py-cpuinfo numpy==1.26
pip install .[dev,autotuning]
ds_report
python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
- name: Python environment
- name: Check container state
run: |
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list
- name: Unit tests
run: |
pip install pytest pytest-timeout tabulate tensorboard wandb
export ONEAPI_ROOT=/opt/intel/oneapi/redist
export FI_PROVIDER_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib/prov
export LD_LIBRARY_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$ONEAPI_ROOT/lib:$LD_LIBRARY_PATH
cd tests/unit
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py
pytest --verbose checkpoint/test_moe_checkpoint.py
pytest --verbose checkpoint/test_shared_weights.py
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
pytest --verbose model_parallelism/*
pytest --verbose moe/test_moe_tp.py
pytest --verbose monitor/*
pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
6 changes: 5 additions & 1 deletion accelerator/hpu_accelerator.py
@@ -298,7 +298,11 @@ def export_envs(self):
return []

def visible_devices_envs(self):
return ['HABANA_VISIBLE_MODULES']
# The way DeepSpeed currently sets this env var is not applicable to all HPU instances.
# Users have to follow the instructions in:
# https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html
# Keeping CUDA_VISIBLE_DEVICES for now.
return ['CUDA_VISIBLE_DEVICES'] #['HABANA_VISIBLE_MODULES']

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
for env in self.visible_devices_envs():
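For context, a minimal sketch (not part of the diff) of how these two hooks are typically combined by a launcher to pin a rank to specific devices; the join-to-comma-separated-string behavior of set_visible_devices_envs is an assumption, since its body is collapsed in this hunk.

# Hypothetical, self-contained sketch of the two accelerator hooks above.
# The comma-joining behavior of set_visible_devices_envs is assumed here.
current_env = {}
local_accelerator_ids = [0, 1]
for env in ['CUDA_VISIBLE_DEVICES']:  # value returned by visible_devices_envs() after this change
    current_env[env] = ",".join(map(str, local_accelerator_ids))
print(current_env)  # {'CUDA_VISIBLE_DEVICES': '0,1'}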
1 change: 1 addition & 0 deletions deepspeed/linear/__init__.py
@@ -5,3 +5,4 @@

from .optimized_linear import OptimizedLinear
from .config import LoRAConfig, QuantizationConfig
from .context_manager import Init, init_lora
12 changes: 11 additions & 1 deletion deepspeed/linear/config.py
@@ -3,7 +3,8 @@

# DeepSpeed Team

from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List


@dataclass
@@ -17,10 +18,19 @@ class LoRAConfig:
base_weight_sharding (int): The degree to which the base weights are sharded,
should typically be set to the data-parallel world size to maximize the memory
reduction benefits. Defaults to 1, which means this feature is disabled.
offload (bool): Whether to offload frozen parameters to CPU when not in use.
offload_ratio (float): Ratio of frozen parameters to offload to CPU when not in use.
delay_lora_init (bool): Whether to defer LoRA parameter initialization from model init to a later manual call (via init_lora).
target_mods (List[str]): Target module names to apply LoRA to; defaults to the Llama 3.1 architecture's projection layers.
"""
lora_r: int = 64
lora_alpha: float = 16.
base_weight_sharding: int = 1
offload: bool = False
offload_ratio: float = 0.0
delay_lora_init: bool = False
target_mods: List[str] = field(
default_factory=lambda: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'])


@dataclass
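To illustrate the new fields, a minimal construction sketch follows (not part of the diff); the values are arbitrary illustrations rather than recommendations, and the defaults remain as defined in the dataclass above.

from deepspeed.linear import LoRAConfig

# Illustrative values only; defaults are defined in the dataclass above.
lora_config = LoRAConfig(
    lora_r=64,
    lora_alpha=16.0,
    base_weight_sharding=8,  # typically the data-parallel world size
    offload=True,            # offload frozen base parameters to CPU when not in use
    offload_ratio=0.5,       # fraction of frozen parameters to offload
    delay_lora_init=True,    # defer LoRA weight init until init_lora() is called later
    target_mods=['q_proj', 'k_proj', 'v_proj', 'o_proj'],  # subset of the default targets
)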
90 changes: 90 additions & 0 deletions deepspeed/linear/context_manager.py
@@ -0,0 +1,90 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .optimized_linear import LoRAOptimizedLinear, OptimizedLinear

import torch

try:
import transformers
except ImportError:
transformers = None


def init_lora(model):
model.requires_grad_(False)
for m in model.modules():
if isinstance(m, LoRAOptimizedLinear):
m.init_lora()


class Init(object):
"""
Init context wrapper, similar in style to zero.Init, that injects OptimizedLinear during model
construction to shard base weights and reduce overall memory usage during model init. Primarily
useful when initializing a model via transformers.AutoModelForCausalLM.
Example usage:
lora_config = deepspeed.linear.LoRAConfig(..)
quant_config = deepspeed.linear.QuantizationConfig(..)
with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config):
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")
"""

def __init__(self, lora_config=None, quant_config=None):
self._orig_nn_linear = torch.nn.Linear
self._orig_causallm_pretrained = None
if transformers is not None:
self._orig_causallm_pretrained = transformers.AutoModelForCausalLM.from_pretrained
self._orig_causallm_config = transformers.AutoModelForCausalLM.from_config
self.lora_config = lora_config
self.quant_config = quant_config
self._post_init_complete = False

def __enter__(self):

class OptLinearWrapper:
_orig_nn_linear = self._orig_nn_linear
_lora_config = self.lora_config
_quant_config = self.quant_config

def __new__(self, *args, **kwargs):
self._lora_config.delay_lora_init = True
kwargs['lora_config'] = self._lora_config
kwargs['quantization_config'] = self._quant_config
kwargs['linear_cls'] = self._orig_nn_linear
return OptimizedLinear(*args, **kwargs)

def _model_init(model):
if self.lora_config is not None:
init_lora(model)
self._post_init_complete = True
return model

# ensures non-lora params are frozen and lora weights are initialized
def from_pretrained(*args, **kwargs):
model = self._orig_causallm_pretrained(*args, **kwargs)
return _model_init(model)

def from_config(*args, **kwargs):
model = self._orig_causallm_config(*args, **kwargs)
return _model_init(model)

torch.nn.Linear = OptLinearWrapper
if transformers is not None:
transformers.AutoModelForCausalLM.from_pretrained = from_pretrained
transformers.AutoModelForCausalLM.from_config = from_config

def __exit__(self, *args, **kwargs):
torch.nn.Linear = self._orig_nn_linear
if transformers is not None:
transformers.AutoModelForCausalLM.from_pretrained = self._orig_causallm_pretrained
transformers.AutoModelForCausalLM.from_config = self._orig_causallm_config
if not self._post_init_complete:
print('WARNING: LoRA modules were not initialized. This is usually done automatically '
'when the model is created via transformers (AutoModelForCausalLM.from_pretrained/from_config). '
'You must call `init_lora` on the model in order to use DeepSpeed LoRA, otherwise '
'you will error out at runtime.')
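A hedged usage sketch of the new context manager follows, mirroring the docstring example above; the checkpoint name comes from that docstring, the config values are illustrative, and running it assumes transformers is installed and hardware capable of loading the model.

import deepspeed.linear as dsl
from transformers import AutoModelForCausalLM

# Config values are illustrative; see LoRAConfig/QuantizationConfig for the defaults.
lora_config = dsl.LoRAConfig(lora_r=64, base_weight_sharding=8)
quant_config = dsl.QuantizationConfig()

with dsl.Init(lora_config=lora_config, quant_config=quant_config):
    # from_pretrained is wrapped inside the context, so init_lora(model) runs automatically.
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")

# If the model were built inside the context without from_pretrained/from_config,
# the LoRA weights would need manual initialization instead:
# dsl.init_lora(model)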