Skip to content

Commit

Permalink
Merge branch 'master' into remove_validate_args
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams authored Sep 12, 2024
2 parents 75bef28 + 170b46e commit 41d8978
Show file tree
Hide file tree
Showing 29 changed files with 106 additions and 64 deletions.
10 changes: 10 additions & 0 deletions accelerator/hpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self):
self._name = 'hpu'
self._communication_backend_name = 'hccl'
self._compile_backend = "hpu_backend"
self.apply_hpu_workarounds()
try:
import habana_frameworks.torch.hpu as hpu
hpu.setDeterministic(True)
Expand All @@ -28,6 +29,15 @@ def __init__(self):

self.fp16_supported = None

def apply_hpu_workarounds(self):

def update_wa_env_var(key, value):
if key not in os.environ.keys():
os.environ[key] = value

update_wa_env_var("PT_HPU_LAZY_ACC_PAR_MODE", "0")
update_wa_env_var("PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES", "0")

# Device APIs
def is_synchronized_device(self):
return False
Expand Down
5 changes: 4 additions & 1 deletion bin/ds_bench
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ import sys
required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
if not all(map(lambda v: v in os.environ, required_env)):
import subprocess
subprocess.run("deepspeed $(which ds_bench) " + " ".join(sys.argv[1:]), shell=True)
r = subprocess.check_output(["which", "ds_bench"])
ds_bench_bin = r.decode('utf-8').strip()
safe_cmd = ["deepspeed", ds_bench_bin] + sys.argv[1:]
subprocess.run(safe_cmd)
else:
args = benchmark_parser().parse_args()
rank = args.local_rank
Expand Down
5 changes: 3 additions & 2 deletions csrc/aio/py_test/ds_aio_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Functionality of swapping tensors to/from (NVMe) storage devices.
"""
import subprocess
import shlex


class Job(object):
Expand Down Expand Up @@ -39,10 +40,10 @@ def close_output_file(self):


def run_job(job):
args = ' '.join(job.cmd())
args = shlex.split(' '.join(job.cmd()))
print(f'args = {args}')
job.open_output_file()
proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
proc = subprocess.run(args=args, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
job.close_output_file()
assert proc.returncode == 0, \
f"This command failed: {job.cmd()}"
2 changes: 2 additions & 0 deletions csrc/fp_quantizer/fp_quantize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
#include <cuda_fp16.h>
#include <curand_kernel.h>

#ifdef BF16_AVAILABLE
#include <cuda_bf16.h>
#endif
#include <cuda_runtime_api.h>

using ROp = reduce::ROpType;
Expand Down
2 changes: 2 additions & 0 deletions csrc/fp_quantizer/includes/fp_quantize.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

#include <cuda_fp16.h>

#ifdef BF16_AVAILABLE
#include <cuda_bf16.h>
#endif
#include <cuda_runtime_api.h>
#include <stdio.h>

Expand Down
5 changes: 3 additions & 2 deletions deepspeed/comm/comm.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,8 +697,9 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True)

master_addr = None
if rank == 0:
hostname_cmd = ["hostname -I"]
result = subprocess.check_output(hostname_cmd, shell=True)
import shlex
hostname_cmd = shlex.split("hostname -I")
result = subprocess.check_output(hostname_cmd)
master_addr = result.decode('utf-8').split()[0]
master_addr = comm.bcast(master_addr, root=0)

Expand Down
4 changes: 3 additions & 1 deletion deepspeed/elasticity/elastic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def _set_master_addr_port(store: Store,

if master_addr is None:
# master_addr = _get_fq_hostname()
result = subprocess.check_output("hostname -I", shell=True)
import shlex
safe_cmd = shlex.split("hostname -I")
result = subprocess.check_output(safe_cmd)
master_addr = result.decode('utf-8').split()[0]

store.set("MASTER_ADDR", master_addr.encode(encoding="UTF-8"))
Expand Down
4 changes: 2 additions & 2 deletions deepspeed/launcher/multinode_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def get_cmd(self, environment, active_resources):
"""Return the command to execute on node"""

def add_export(self, key, var):
self.exports[key.strip()] = var.strip()
self.exports[key.strip()] = f"\"{var.strip()}\""

def parse_user_args(self):
return self.args.user_args
Expand Down Expand Up @@ -406,7 +406,7 @@ def backend_exists(self):
if not mpiname_exists:
warnings.warn("mpiname does not exist, mvapich is not installed properly")
else:
results = subprocess.check_output('mpiname', shell=True)
results = subprocess.check_output(['mpiname'])
mpiname_results = results.decode('utf-8').strip()
if "MVAPICH2-GDR" in mpiname_results:
exists = True
Expand Down
8 changes: 5 additions & 3 deletions deepspeed/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from copy import deepcopy
import signal
import time
import shlex

from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, IMPIRunner
from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER, IMPI_LAUNCHER
Expand Down Expand Up @@ -445,7 +446,8 @@ def main(args=None):
if args.ssh_port is not None:
ssh_check_cmd += f"-p {args.ssh_port} "
ssh_check_cmd += f"{first_host} hostname"
subprocess.check_call(ssh_check_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, shell=True)
safe_ssh_cmd = shlex.split(ssh_check_cmd)
subprocess.check_call(safe_ssh_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
except subprocess.CalledProcessError:
raise RuntimeError(
f"Using hostfile at {args.hostfile} but host={first_host} was not reachable via ssh. If you are running with a single node please remove {args.hostfile} or setup passwordless ssh."
Expand All @@ -458,9 +460,9 @@ def main(args=None):
if args.ssh_port is not None:
ssh_check_cmd += f" -p {args.ssh_port}"
ssh_check_cmd += f" {first_host} hostname -I"
hostname_cmd = [ssh_check_cmd]
hostname_cmd = shlex.split(ssh_check_cmd)
try:
result = subprocess.check_output(hostname_cmd, shell=True)
result = subprocess.check_output(hostname_cmd)
except subprocess.CalledProcessError as err:
logger.error(
"Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr"
Expand Down
6 changes: 4 additions & 2 deletions deepspeed/profiling/flops_profiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from deepspeed.utils import logger
from deepspeed.moe.layer import MoE
from deepspeed.utils.timer import FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, STEP_GLOBAL_TIMER
from deepspeed.utils.torch import required_torch_version

Tensor = torch.Tensor

Expand Down Expand Up @@ -908,8 +909,9 @@ def _patch_functionals():
# embedding
F.embedding = wrapFunc(F.embedding, _embedding_flops_compute)

# attn
F.scaled_dot_product_attention = wrapFunc(F.scaled_dot_product_attention, _attn_flops_compute)
# attn - scaled_dot_product_attention added in torch 2.0+
if required_torch_version(min_version=2.0):
F.scaled_dot_product_attention = wrapFunc(F.scaled_dot_product_attention, _attn_flops_compute)


def _patch_tensor_methods():
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/bf16_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ def state(self):

def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx):
assert self.immediate_grad_update
self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=True)
self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=False)

def create_grad_acc_hooks(self):
self.grad_accs = []
Expand Down
4 changes: 1 addition & 3 deletions deepspeed/runtime/pipe/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,9 +482,7 @@ def eval_batch(self,
micro_batches = self.micro_batches if num_micro_batches is None else num_micro_batches

# Do the work
sched = schedule.InferenceSchedule(micro_batches=self.micro_batches,
stages=self.num_stages,
stage_id=self.stage_id)
sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=self.num_stages, stage_id=self.stage_id)

# prevent dead-lock with multiple evals sequence
dist.barrier()
Expand Down
9 changes: 5 additions & 4 deletions docs/_tutorials/accelerator-setup-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ ipex_model = ipex.llm.optimize(deepspeed_model)
```
to get model optimzied by Intel Extension for PyTorch.

## More example for using DeepSpeed with Intel Extension for PyTorch on Intel Architecture CPU
Refer to https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm for more extensive guide.
## More examples for using DeepSpeed on Intel CPU
Refer to [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) for more code samples of running inference with DeepSpeed on Intel CPU.


# Intel XPU
DeepSpeed XPU accelerator supports Intel® Data Center GPU Max Series.
Expand Down Expand Up @@ -131,8 +132,8 @@ XPU available: True
accelerator: xpu
```

## More example for using DeepSpeed on Intel XPU
Refer to https://github.com/intel/intel-extension-for-pytorch/tree/release/xpu/2.1.40/examples/gpu/inference/python/llm for more extensive guide.
## More examples for using DeepSpeed on Intel XPU
Refer to [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/llm), [Megatron-DeepSpeed training examples](https://github.com/intel/intel-extension-for-deepspeed/tree/main/examples) for more code samples of running LLM with DeepSpeed on Intel XPU.


# Huawei Ascend NPU
Expand Down
12 changes: 7 additions & 5 deletions op_builder/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ def get_rocm_gpu_arch():
rocm_info = Path("rocminfo")
rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'"
try:
result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True)
safe_cmd = shlex.split(rocm_gpu_arch_cmd)
result = subprocess.check_output(safe_cmd)
rocm_gpu_arch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
rocm_gpu_arch = ""
Expand All @@ -271,7 +272,8 @@ def get_rocm_wavefront_size():
rocm_wavefront_size_cmd = str(
rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'"
try:
result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True)
safe_cmd = shlex.split(rocm_wavefront_size_cmd)
result = subprocess.check_output(rocm_wavefront_size_cmd)
rocm_wavefront_size = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
rocm_wavefront_size = "32"
Expand All @@ -296,7 +298,7 @@ def cxx_args(self):
'''
return []

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
'''
Check if all non-python dependencies are satisfied to build this op
'''
Expand Down Expand Up @@ -432,7 +434,7 @@ def _backup_cpuinfo(self):
"to detect the CPU architecture. 'lscpu' does not appear to exist on "
"your system, will fall back to use -march=native and non-vectorized execution.")
return None
result = subprocess.check_output('lscpu', shell=True)
result = subprocess.check_output(['lscpu'])
result = result.decode('utf-8').strip().lower()

cpu_info = {}
Expand Down Expand Up @@ -679,7 +681,7 @@ def version_dependent_macros(self):
version_ge_1_5 = ['-DVERSION_GE_1_5']
return version_ge_1_1 + version_ge_1_3 + version_ge_1_5

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
return super().is_compatible(verbose)

def builder(self):
Expand Down
4 changes: 2 additions & 2 deletions op_builder/cpu/comm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def include_paths(self):
def cxx_args(self):
return ['-O2', '-fopenmp']

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
# TODO: add soft compatibility check for private binary release.
# a soft check, as in we know it can be trivially changed.
return super().is_compatible(verbose)
Expand Down Expand Up @@ -65,7 +65,7 @@ def include_paths(self):
def cxx_args(self):
return ['-O2', '-fopenmp']

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
# TODO: add soft compatibility check for private binary release.
# a soft check, as in we know it can be trivially changed.
return super().is_compatible(verbose)
2 changes: 1 addition & 1 deletion op_builder/inference_core_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.inference.v2.kernels{self.NAME}'

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion op_builder/inference_cutlass_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.inference.v2.kernels.cutlass_ops.{self.NAME}'

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion op_builder/npu/async_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def check_for_libaio_pkg(self):
break
return found

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
# Check for the existence of libaio by using distutils
# to compile and link a test program that calls io_submit,
# which is a function provided by libaio that is used in the async_io op.
Expand Down
2 changes: 1 addition & 1 deletion op_builder/ragged_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.inference.v2.kernels.ragged_ops.{self.NAME}'

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion op_builder/ragged_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.inference.v2.{self.NAME}'

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
Expand Down
1 change: 0 additions & 1 deletion op_builder/sparse_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def is_compatible(self, verbose=False):
if verbose:
self.warning(
f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}')

try:
import triton
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion op_builder/spatial_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.ops.spatial.{self.NAME}_op'

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion op_builder/transformer_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.ops.transformer.inference.{self.NAME}_op'

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion op_builder/xpu/async_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def check_for_libaio_pkg(self):
break
return found

def is_compatible(self, verbose=True):
def is_compatible(self, verbose=False):
# Check for the existence of libaio by using distutils
# to compile and link a test program that calls io_submit,
# which is a function provided by libaio that is used in the async_io op.
Expand Down
14 changes: 8 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from setuptools.command import egg_info
import time
import typing
import shlex

torch_available = True
try:
Expand Down Expand Up @@ -157,10 +158,11 @@ def get_env_if_set(key, default: typing.Any = ""):

def command_exists(cmd):
if sys.platform == "win32":
result = subprocess.Popen(f'{cmd}', stdout=subprocess.PIPE, shell=True)
safe_cmd = shlex.split(f'{cmd}')
result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE)
return result.wait() == 1
else:
safe_cmd = ["bash", "-c", f"type {cmd}"]
safe_cmd = shlex.split(f"bash -c type {cmd}")
result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE)
return result.wait() == 0

Expand Down Expand Up @@ -200,13 +202,13 @@ def op_enabled(op_name):
print(f'Install Ops={install_ops}')

# Write out version/git info.
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
git_hash_cmd = shlex.split("bash -c git rev-parse --short HEAD")
git_branch_cmd = shlex.split("bash -c git rev-parse --abbrev-ref HEAD")
if command_exists('git') and not is_env_set('DS_BUILD_STRING'):
try:
result = subprocess.check_output(git_hash_cmd, shell=True)
result = subprocess.check_output(git_hash_cmd)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
result = subprocess.check_output(git_branch_cmd)
git_branch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
git_hash = "unknown"
Expand Down
Loading

0 comments on commit 41d8978

Please sign in to comment.