Refactor to support accelerator_type instead of num_gpus

Signed-off-by: maheedhar reddy chappidi <[email protected]>
ray-project · Aug 4, 2023 · faf7e9f · faf7e9f
1 parent 4978631
commit faf7e9f
Show file tree

Hide file tree

Showing 12 changed files with 353 additions and 57 deletions.
diff --git a/python/ray/__init__.py b/python/ray/__init__.py
@@ -197,6 +197,7 @@ def __getattr__(self, attr):
     "get",
     "get_actor",
     "get_gpu_ids",
+    "get_neuron_core_ids",
     "init",
     "is_initialized",
     "java_actor_class",
@@ -223,6 +224,7 @@ def __getattr__(self, attr):
     "get",
     "get_actor",
     "get_gpu_ids",
+    "get_neuron_core_ids",
     "kill",
     "put",
     "wait",

diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py
@@ -388,7 +388,24 @@ def env_set_by_user(key):
 
 LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]
 
+# Accelerator constants
+NOSET_AWS_NEURON_VISIBLE_CORES_ENV_VAR = (
+    "RAY_EXPERIMENTAL_NOSET_AWS_NEURON_VISIBLE_CORES"
+)
 NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
+NUM_NEURON_CORES = "num_neuron_cores"
+# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
+# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
+# Subject to removal after the information is available via public API
+AWS_NEURON_INSTANCE_MAP = {
+    "trn1.2xlarge": 2,
+    "trn1.32xlarge": 32,
+    "trn1n.32xlarge": 32,
+    "inf2.2xlarge": 2,
+    "inf2.8xlarge": 2,
+    "inf2.24xlarge": 12,
+    "inf2.48xlarge": 24,
+}
 RAY_WORKER_NICENESS = "RAY_worker_niceness"
 
 # Default max_retries option in @ray.remote for non-actor

diff --git a/python/ray/_private/resource_spec.py b/python/ray/_private/resource_spec.py
@@ -1,4 +1,5 @@
 import importlib.util
+import json
 import logging
 import os
 import re
@@ -9,6 +10,7 @@
 
 import ray
 import ray._private.ray_constants as ray_constants
+from ray._private.utils import get_neuron_core_constraint_name
 
 try:
     import GPUtil
@@ -198,6 +200,24 @@ def resolve(self, is_head: bool, node_ip_address: Optional[str] = None):
         except Exception:
             logger.exception("Could not parse gpu information.")
 
+        # 1. Check if the user specified num_neuron_cores in resources
+        num_neuron_cores = resources.get(ray_constants.NUM_NEURON_CORES, None)
+        # 2. Auto-detect num_neuron_cores if not specified in resources
+        if num_neuron_cores is None:
+            num_neuron_cores = _autodetect_aws_neuron_cores()
+        if num_neuron_cores is not None:
+            if num_gpus is not None:
+                raise ValueError("Cannot specify both num_gpus and num_neuron_cores.")
+
+            # 3. Update accelerator_type and num_neuron_cores with
+            # number of neuron cores detected or configured.
+            resources.update(
+                {
+                    ray_constants.NUM_NEURON_CORES: num_neuron_cores,
+                    get_neuron_core_constraint_name(): num_neuron_cores,
+                }
+            )
+
         # Choose a default object store size.
         system_memory = ray._private.utils.get_system_memory()
         avail_memory = ray._private.utils.estimate_available_memory()
@@ -300,17 +320,42 @@ def _autodetect_num_gpus():
     return result
 
 
+def _autodetect_aws_neuron_cores():
+    """
+    Attempt to detect the number of Neuron cores on this machine.
+
+    Returns:
+        The number of Neuron cores if any were detected, otherwise None.
+    """
+    result = None
+    if sys.platform.startswith("linux") and os.path.isdir("/opt/aws/neuron/bin/"):
+        result = _get_neuron_core_count()
+    return result
+
+
+def _get_neuron_core_count():
+    neuron_path = "/opt/aws/neuron/bin/"
+    nc_count: int = 0
+    result = subprocess.run(
+        [os.path.join(neuron_path, "neuron-ls"), "--json-output"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    if result.returncode == 0 and result.stdout:
+        json_out = json.loads(result.stdout)
+        for neuron_device in json_out:
+            nc_count += neuron_device["nc_count"]
+    return nc_count
+
+
 def _get_gpu_types_gputil():
     gpu_list = GPUtil.getGPUs()
     if len(gpu_list) > 0:
         gpu_list_names = [gpu.name for gpu in gpu_list]
         info_str = gpu_list_names.pop()
-        pretty_name = _pretty_gpu_name(info_str)
+        pretty_name = _pretty_nvidia_gpu_name(info_str)
         if pretty_name:
-            constraint_name = (
-                f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
-            )
-            return {constraint_name: 1}
+            return {ray._private.utils.get_constraint_name(pretty_name): 1}
     return {}
 
 
@@ -336,11 +381,8 @@ def _constraints_from_gpu_info(info_str: str):
         if k.strip() == "Model":
             full_model_name = v.strip()
             break
-    pretty_name = _pretty_gpu_name(full_model_name)
-    if pretty_name:
-        constraint_name = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
-        return {constraint_name: 1}
-    return {}
+    pretty_name = _pretty_nvidia_gpu_name(full_model_name)
+    return {ray._private.utils.get_constraint_name(pretty_name): 1}
 
 
 def _get_gpu_info_string():
@@ -364,11 +406,11 @@ def _get_gpu_info_string():
 
 # TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have
 # the form "Tesla V100-SXM2-16GB" or "Tesla K80").
-GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
+NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
 
 
-def _pretty_gpu_name(name):
+def _pretty_nvidia_gpu_name(name):
     if name is None:
         return None
-    match = GPU_NAME_PATTERN.match(name)
+    match = NVIDIA_GPU_NAME_PATTERN.match(name)
     return match.group(1) if match else None
diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py
@@ -44,11 +44,11 @@
 from ray.core.generated.runtime_env_common_pb2 import (
     RuntimeEnvInfo as ProtoRuntimeEnvInfo,
 )
+from ray.util.accelerators import AWS_NEURON_CORE
 
 if TYPE_CHECKING:
     from ray.runtime_env import RuntimeEnv
 
-
 pwd = None
 if sys.platform != "win32":
     import pwd
@@ -64,15 +64,13 @@
 win32_job = None
 win32_AssignProcessToJobObject = None
 
-
 ENV_DISABLE_DOCKER_CPU_WARNING = "RAY_DISABLE_DOCKER_CPU_WARNING" in os.environ
 _PYARROW_VERSION = None
 
 # This global variable is used for testing only
 _CALLED_FREQ = defaultdict(lambda: 0)
 _CALLED_FREQ_LOCK = threading.Lock()
 
-
 PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN = re.compile(
     r"(.+)_group_(\d+)_([0-9a-zA-Z]+)"
 )
@@ -279,29 +277,45 @@ def compute_driver_id_from_job(job_id):
 
 
 def get_cuda_visible_devices():
-    """Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
+    """
+    Get the device IDs using CUDA_VISIBLE_DEVICES environment variable.
+    """
+    return _get_visible_ids(env_var="CUDA_VISIBLE_DEVICES")
+
+
+def get_aws_neuron_core_visible_ids():
+    """
+    Get the device IDs using NEURON_RT_VISIBLE_CORES environment variable.
+    """
+    return _get_visible_ids(env_var="NEURON_RT_VISIBLE_CORES")
+
+
+def _get_visible_ids(env_var: str):
+    """Get the device IDs from defined environment variable.
+    Args:
+        env_var: Environment variable to set based on the accelerator runtime.
 
     Returns:
-        devices (List[str]): If CUDA_VISIBLE_DEVICES is set, returns a
-            list of strings representing the IDs of the visible GPUs.
+        devices (List[str]): If environment variable is set, returns a
+            list of strings representing the IDs of the visible devices or cores.
             If it is not set or is set to NoDevFiles, returns empty list.
     """
-    gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    visible_ids_str = os.environ.get(env_var, None)
 
-    if gpu_ids_str is None:
+    if visible_ids_str is None:
         return None
 
-    if gpu_ids_str == "":
+    if visible_ids_str == "":
         return []
 
-    if gpu_ids_str == "NoDevFiles":
+    if visible_ids_str == "NoDevFiles":
         return []
 
-    # GPU identifiers are given as strings representing integers or UUIDs.
-    return list(gpu_ids_str.split(","))
+    # Identifiers are given as strings representing integers or UUIDs.
+    return list(visible_ids_str.split(","))
 
 
-last_set_gpu_ids = None
+last_set_visible_ids = None
 
 
 def set_omp_num_threads_if_unset() -> bool:
@@ -350,16 +364,51 @@ def set_cuda_visible_devices(gpu_ids):
     Args:
         gpu_ids (List[str]): List of strings representing GPU IDs.
     """
-
     if os.environ.get(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR):
         return
+    set_visible_ids(gpu_ids, "CUDA_VISIBLE_DEVICES")
+
+
+def set_aws_neuron_core_visible_ids(core_ids):
+    """Set the NEURON_RT_VISIBLE_CORES environment variable.
+
+    Args:
+        core_ids (List[str]): List of strings representing core IDs.
+    """
+    if os.environ.get(ray_constants.NOSET_AWS_NEURON_VISIBLE_CORES_ENV_VAR):
+        return
+    set_visible_ids(core_ids, "NEURON_RT_VISIBLE_CORES")
+
+
+def set_visible_ids(visible_ids, env_var: str):
+    """Set the environment variable passed based on accelerator runtime.
+
+    Args:
+        visible_ids (List[str]): List of strings representing GPU IDs.
+        env_var: Environment variable to set based on GPU runtime.
+
+    """
 
-    global last_set_gpu_ids
-    if last_set_gpu_ids == gpu_ids:
+    global last_set_visible_ids
+    if last_set_visible_ids == visible_ids:
         return  # optimization: already set
 
-    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])
-    last_set_gpu_ids = gpu_ids
+    os.environ[env_var] = ",".join([str(i) for i in visible_ids])
+    last_set_visible_ids = visible_ids
+
+
+def get_neuron_core_constraint_name():
+    """Get the name of the constraint that represents the AWS Neuron core accelerator.
+
+    Returns:
+        (str) The constraint name.
+    """
+    return get_constraint_name(AWS_NEURON_CORE)
+
+
+def get_constraint_name(pretty_name: str):
+    constraint_name = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
+    return constraint_name
 
 
 def resources_from_ray_options(options_dict: Dict[str, Any]) -> Dict[str, Any]:
@@ -397,6 +446,7 @@ def resources_from_ray_options(options_dict: Dict[str, Any]) -> Dict[str, Any]:
         resources["memory"] = int(memory)
     if object_store_memory is not None:
         resources["object_store_memory"] = object_store_memory
+    # TODO: Confirm if value is valid
     if accelerator_type is not None:
         resources[
             f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}"