Merge branch 'master' into remove_validate_args

microsoft · Sep 12, 2024 · 41d8978 · 41d8978
2 parents 75bef28 + 170b46e
commit 41d8978
Show file tree

Hide file tree

Showing 29 changed files with 106 additions and 64 deletions.
diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py
@@ -18,6 +18,7 @@ def __init__(self):
         self._name = 'hpu'
         self._communication_backend_name = 'hccl'
         self._compile_backend = "hpu_backend"
+        self.apply_hpu_workarounds()
         try:
             import habana_frameworks.torch.hpu as hpu
             hpu.setDeterministic(True)
@@ -28,6 +29,15 @@ def __init__(self):
 
         self.fp16_supported = None
 
+    def apply_hpu_workarounds(self):
+
+        def update_wa_env_var(key, value):
+            if key not in os.environ.keys():
+                os.environ[key] = value
+
+        update_wa_env_var("PT_HPU_LAZY_ACC_PAR_MODE", "0")
+        update_wa_env_var("PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES", "0")
+
     # Device APIs
     def is_synchronized_device(self):
         return False

diff --git a/bin/ds_bench b/bin/ds_bench
@@ -10,7 +10,10 @@ import sys
 required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
 if not all(map(lambda v: v in os.environ, required_env)):
     import subprocess
-    subprocess.run("deepspeed $(which ds_bench) " + " ".join(sys.argv[1:]), shell=True)
+    r = subprocess.check_output(["which", "ds_bench"])
+    ds_bench_bin = r.decode('utf-8').strip()
+    safe_cmd = ["deepspeed", ds_bench_bin] + sys.argv[1:]
+    subprocess.run(safe_cmd)
 else:
     args = benchmark_parser().parse_args()
     rank = args.local_rank

diff --git a/csrc/aio/py_test/ds_aio_job.py b/csrc/aio/py_test/ds_aio_job.py
@@ -6,6 +6,7 @@
 Functionality of swapping tensors to/from (NVMe) storage devices.
 """
 import subprocess
+import shlex
 
 
 class Job(object):
@@ -39,10 +40,10 @@ def close_output_file(self):
 
 
 def run_job(job):
-    args = ' '.join(job.cmd())
+    args = shlex.split(' '.join(job.cmd()))
     print(f'args = {args}')
     job.open_output_file()
-    proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
+    proc = subprocess.run(args=args, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
     job.close_output_file()
     assert proc.returncode == 0, \
     f"This command failed: {job.cmd()}"
diff --git a/csrc/fp_quantizer/fp_quantize.cu b/csrc/fp_quantizer/fp_quantize.cu
@@ -15,7 +15,9 @@
 #include <cuda_fp16.h>
 #include <curand_kernel.h>
 
+#ifdef BF16_AVAILABLE
 #include <cuda_bf16.h>
+#endif
 #include <cuda_runtime_api.h>
 
 using ROp = reduce::ROpType;

diff --git a/csrc/fp_quantizer/includes/fp_quantize.h b/csrc/fp_quantizer/includes/fp_quantize.h
@@ -10,7 +10,9 @@
 
 #include <cuda_fp16.h>
 
+#ifdef BF16_AVAILABLE
 #include <cuda_bf16.h>
+#endif
 #include <cuda_runtime_api.h>
 #include <stdio.h>
 

diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py
@@ -697,8 +697,9 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True)
 
     master_addr = None
     if rank == 0:
-        hostname_cmd = ["hostname -I"]
-        result = subprocess.check_output(hostname_cmd, shell=True)
+        import shlex
+        hostname_cmd = shlex.split("hostname -I")
+        result = subprocess.check_output(hostname_cmd)
         master_addr = result.decode('utf-8').split()[0]
     master_addr = comm.bcast(master_addr, root=0)
 

diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py
@@ -54,7 +54,9 @@ def _set_master_addr_port(store: Store,
 
         if master_addr is None:
             # master_addr = _get_fq_hostname()
-            result = subprocess.check_output("hostname -I", shell=True)
+            import shlex
+            safe_cmd = shlex.split("hostname -I")
+            result = subprocess.check_output(safe_cmd)
             master_addr = result.decode('utf-8').split()[0]
 
         store.set("MASTER_ADDR", master_addr.encode(encoding="UTF-8"))

diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py
@@ -34,7 +34,7 @@ def get_cmd(self, environment, active_resources):
         """Return the command to execute on node"""
 
     def add_export(self, key, var):
-        self.exports[key.strip()] = var.strip()
+        self.exports[key.strip()] = f"\"{var.strip()}\""
 
     def parse_user_args(self):
         return self.args.user_args
@@ -406,7 +406,7 @@ def backend_exists(self):
         if not mpiname_exists:
             warnings.warn("mpiname does not exist, mvapich is not installed properly")
         else:
-            results = subprocess.check_output('mpiname', shell=True)
+            results = subprocess.check_output(['mpiname'])
             mpiname_results = results.decode('utf-8').strip()
             if "MVAPICH2-GDR" in mpiname_results:
                 exists = True

diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
@@ -20,6 +20,7 @@
 from copy import deepcopy
 import signal
 import time
+import shlex
 
 from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, IMPIRunner
 from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER, IMPI_LAUNCHER
@@ -445,7 +446,8 @@ def main(args=None):
             if args.ssh_port is not None:
                 ssh_check_cmd += f"-p {args.ssh_port} "
             ssh_check_cmd += f"{first_host} hostname"
-            subprocess.check_call(ssh_check_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, shell=True)
+            safe_ssh_cmd = shlex.split(ssh_check_cmd)
+            subprocess.check_call(safe_ssh_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
         except subprocess.CalledProcessError:
             raise RuntimeError(
                 f"Using hostfile at {args.hostfile} but host={first_host} was not reachable via ssh. If you are running with a single node please remove {args.hostfile} or setup passwordless ssh."
@@ -458,9 +460,9 @@ def main(args=None):
         if args.ssh_port is not None:
             ssh_check_cmd += f" -p {args.ssh_port}"
         ssh_check_cmd += f" {first_host} hostname -I"
-        hostname_cmd = [ssh_check_cmd]
+        hostname_cmd = shlex.split(ssh_check_cmd)
         try:
-            result = subprocess.check_output(hostname_cmd, shell=True)
+            result = subprocess.check_output(hostname_cmd)
         except subprocess.CalledProcessError as err:
             logger.error(
                 "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr"

diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py
@@ -15,6 +15,7 @@
 from deepspeed.utils import logger
 from deepspeed.moe.layer import MoE
 from deepspeed.utils.timer import FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, STEP_GLOBAL_TIMER
+from deepspeed.utils.torch import required_torch_version
 
 Tensor = torch.Tensor
 
@@ -908,8 +909,9 @@ def _patch_functionals():
     # embedding
     F.embedding = wrapFunc(F.embedding, _embedding_flops_compute)
 
-    # attn
-    F.scaled_dot_product_attention = wrapFunc(F.scaled_dot_product_attention, _attn_flops_compute)
+    # attn - scaled_dot_product_attention added in torch 2.0+
+    if required_torch_version(min_version=2.0):
+        F.scaled_dot_product_attention = wrapFunc(F.scaled_dot_product_attention, _attn_flops_compute)
 
 
 def _patch_tensor_methods():

diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py
@@ -534,7 +534,7 @@ def state(self):
 
     def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx):
         assert self.immediate_grad_update
-        self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=True)
+        self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=False)
 
     def create_grad_acc_hooks(self):
         self.grad_accs = []

diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
@@ -482,9 +482,7 @@ def eval_batch(self,
         micro_batches = self.micro_batches if num_micro_batches is None else num_micro_batches
 
         # Do the work
-        sched = schedule.InferenceSchedule(micro_batches=self.micro_batches,
-                                           stages=self.num_stages,
-                                           stage_id=self.stage_id)
+        sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=self.num_stages, stage_id=self.stage_id)
 
         # prevent dead-lock with multiple evals sequence
         dist.barrier()

diff --git a/docs/_tutorials/accelerator-setup-guide.md b/docs/_tutorials/accelerator-setup-guide.md
@@ -95,8 +95,9 @@ ipex_model = ipex.llm.optimize(deepspeed_model)
 ```
 to get model optimzied by Intel Extension for PyTorch.
 
-## More example for using DeepSpeed with Intel Extension for PyTorch on Intel Architecture CPU
-Refer to https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm for more extensive guide.
+## More examples for using DeepSpeed on Intel CPU
+Refer to [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) for more code samples of running inference with DeepSpeed on Intel CPU.
+
 
 # Intel XPU
 DeepSpeed XPU accelerator supports Intel® Data Center GPU Max Series.
@@ -131,8 +132,8 @@ XPU available: True
 accelerator: xpu
 ```
 
-## More example for using DeepSpeed on Intel XPU
-Refer to https://github.com/intel/intel-extension-for-pytorch/tree/release/xpu/2.1.40/examples/gpu/inference/python/llm for more extensive guide.
+## More examples for using DeepSpeed on Intel XPU
+Refer to [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/llm), [Megatron-DeepSpeed training examples](https://github.com/intel/intel-extension-for-deepspeed/tree/main/examples) for more code samples of running LLM with DeepSpeed on Intel XPU.
 
 
 # Huawei Ascend NPU

diff --git a/op_builder/builder.py b/op_builder/builder.py
@@ -253,7 +253,8 @@ def get_rocm_gpu_arch():
             rocm_info = Path("rocminfo")
         rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'"
         try:
-            result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True)
+            safe_cmd = shlex.split(rocm_gpu_arch_cmd)
+            result = subprocess.check_output(safe_cmd)
             rocm_gpu_arch = result.decode('utf-8').strip()
         except subprocess.CalledProcessError:
             rocm_gpu_arch = ""
@@ -271,7 +272,8 @@ def get_rocm_wavefront_size():
         rocm_wavefront_size_cmd = str(
             rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'"
         try:
-            result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True)
+            safe_cmd = shlex.split(rocm_wavefront_size_cmd)
+            result = subprocess.check_output(rocm_wavefront_size_cmd)
             rocm_wavefront_size = result.decode('utf-8').strip()
         except subprocess.CalledProcessError:
             rocm_wavefront_size = "32"
@@ -296,7 +298,7 @@ def cxx_args(self):
         '''
         return []
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         '''
         Check if all non-python dependencies are satisfied to build this op
         '''
@@ -432,7 +434,7 @@ def _backup_cpuinfo(self):
                          "to detect the CPU architecture. 'lscpu' does not appear to exist on "
                          "your system, will fall back to use -march=native and non-vectorized execution.")
             return None
-        result = subprocess.check_output('lscpu', shell=True)
+        result = subprocess.check_output(['lscpu'])
         result = result.decode('utf-8').strip().lower()
 
         cpu_info = {}
@@ -679,7 +681,7 @@ def version_dependent_macros(self):
             version_ge_1_5 = ['-DVERSION_GE_1_5']
         return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         return super().is_compatible(verbose)
 
     def builder(self):

diff --git a/op_builder/cpu/comm.py b/op_builder/cpu/comm.py
@@ -28,7 +28,7 @@ def include_paths(self):
     def cxx_args(self):
         return ['-O2', '-fopenmp']
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         # TODO: add soft compatibility check for private binary release.
         #  a soft check, as in we know it can be trivially changed.
         return super().is_compatible(verbose)
@@ -65,7 +65,7 @@ def include_paths(self):
     def cxx_args(self):
         return ['-O2', '-fopenmp']
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         # TODO: add soft compatibility check for private binary release.
         #  a soft check, as in we know it can be trivially changed.
         return super().is_compatible(verbose)
diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py
@@ -19,7 +19,7 @@ def __init__(self, name=None):
     def absolute_name(self):
         return f'deepspeed.inference.v2.kernels{self.NAME}'
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         try:
             import torch
         except ImportError:

diff --git a/op_builder/inference_cutlass_builder.py b/op_builder/inference_cutlass_builder.py
@@ -18,7 +18,7 @@ def __init__(self, name=None):
     def absolute_name(self):
         return f'deepspeed.inference.v2.kernels.cutlass_ops.{self.NAME}'
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         try:
             import torch
         except ImportError:

diff --git a/op_builder/npu/async_io.py b/op_builder/npu/async_io.py
@@ -83,7 +83,7 @@ def check_for_libaio_pkg(self):
                 break
         return found
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         # Check for the existence of libaio by using distutils
         # to compile and link a test program that calls io_submit,
         # which is a function provided by libaio that is used in the async_io op.

diff --git a/op_builder/ragged_ops.py b/op_builder/ragged_ops.py
@@ -19,7 +19,7 @@ def __init__(self, name=None):
     def absolute_name(self):
         return f'deepspeed.inference.v2.kernels.ragged_ops.{self.NAME}'
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         try:
             import torch
         except ImportError:

diff --git a/op_builder/ragged_utils.py b/op_builder/ragged_utils.py
@@ -19,7 +19,7 @@ def __init__(self, name=None):
     def absolute_name(self):
         return f'deepspeed.inference.v2.{self.NAME}'
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         try:
             import torch
         except ImportError:

diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py
@@ -64,7 +64,6 @@ def is_compatible(self, verbose=False):
             if verbose:
                 self.warning(
                     f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}')
-
         try:
             import triton
         except ImportError:

diff --git a/op_builder/spatial_inference.py b/op_builder/spatial_inference.py
@@ -17,7 +17,7 @@ def __init__(self, name=None):
     def absolute_name(self):
         return f'deepspeed.ops.spatial.{self.NAME}_op'
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         try:
             import torch
         except ImportError:

diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py
@@ -17,7 +17,7 @@ def __init__(self, name=None):
     def absolute_name(self):
         return f'deepspeed.ops.transformer.inference.{self.NAME}_op'
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         try:
             import torch
         except ImportError:

diff --git a/op_builder/xpu/async_io.py b/op_builder/xpu/async_io.py
@@ -79,7 +79,7 @@ def check_for_libaio_pkg(self):
                 break
         return found
 
-    def is_compatible(self, verbose=True):
+    def is_compatible(self, verbose=False):
         # Check for the existence of libaio by using distutils
         # to compile and link a test program that calls io_submit,
         # which is a function provided by libaio that is used in the async_io op.

diff --git a/setup.py b/setup.py
@@ -27,6 +27,7 @@
 from setuptools.command import egg_info
 import time
 import typing
+import shlex
 
 torch_available = True
 try:
@@ -157,10 +158,11 @@ def get_env_if_set(key, default: typing.Any = ""):
 
 def command_exists(cmd):
     if sys.platform == "win32":
-        result = subprocess.Popen(f'{cmd}', stdout=subprocess.PIPE, shell=True)
+        safe_cmd = shlex.split(f'{cmd}')
+        result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE)
         return result.wait() == 1
     else:
-        safe_cmd = ["bash", "-c", f"type {cmd}"]
+        safe_cmd = shlex.split(f"bash -c type {cmd}")
         result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE)
         return result.wait() == 0
 
@@ -200,13 +202,13 @@ def op_enabled(op_name):
 print(f'Install Ops={install_ops}')
 
 # Write out version/git info.
-git_hash_cmd = "git rev-parse --short HEAD"
-git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
+git_hash_cmd = shlex.split("bash -c git rev-parse --short HEAD")
+git_branch_cmd = shlex.split("bash -c git rev-parse --abbrev-ref HEAD")
 if command_exists('git') and not is_env_set('DS_BUILD_STRING'):
     try:
-        result = subprocess.check_output(git_hash_cmd, shell=True)
+        result = subprocess.check_output(git_hash_cmd)
         git_hash = result.decode('utf-8').strip()
-        result = subprocess.check_output(git_branch_cmd, shell=True)
+        result = subprocess.check_output(git_branch_cmd)
         git_branch = result.decode('utf-8').strip()
     except subprocess.CalledProcessError:
         git_hash = "unknown"