Skip to content

Commit

Permalink
Refactor to support accelerator_type instead of num_gpus
Browse files Browse the repository at this point in the history
Signed-off-by: maheedhar reddy chappidi <[email protected]>
  • Loading branch information
chappidim committed Aug 4, 2023
1 parent 4978631 commit faf7e9f
Show file tree
Hide file tree
Showing 12 changed files with 353 additions and 57 deletions.
2 changes: 2 additions & 0 deletions python/ray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def __getattr__(self, attr):
"get",
"get_actor",
"get_gpu_ids",
"get_neuron_core_ids",
"init",
"is_initialized",
"java_actor_class",
Expand All @@ -223,6 +224,7 @@ def __getattr__(self, attr):
"get",
"get_actor",
"get_gpu_ids",
"get_neuron_core_ids",
"kill",
"put",
"wait",
Expand Down
17 changes: 17 additions & 0 deletions python/ray/_private/ray_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,24 @@ def env_set_by_user(key):

LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]

# Accelerator constants
NOSET_AWS_NEURON_VISIBLE_CORES_ENV_VAR = (
"RAY_EXPERIMENTAL_NOSET_AWS_NEURON_VISIBLE_CORES"
)
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
NUM_NEURON_CORES = "num_neuron_cores"
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
# Subject to removal after the information is available via public API
AWS_NEURON_INSTANCE_MAP = {
"trn1.2xlarge": 2,
"trn1.32xlarge": 32,
"trn1n.32xlarge": 32,
"inf2.2xlarge": 2,
"inf2.8xlarge": 2,
"inf2.24xlarge": 12,
"inf2.48xlarge": 24,
}
RAY_WORKER_NICENESS = "RAY_worker_niceness"

# Default max_retries option in @ray.remote for non-actor
Expand Down
68 changes: 55 additions & 13 deletions python/ray/_private/resource_spec.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import importlib.util
import json
import logging
import os
import re
Expand All @@ -9,6 +10,7 @@

import ray
import ray._private.ray_constants as ray_constants
from ray._private.utils import get_neuron_core_constraint_name

try:
import GPUtil
Expand Down Expand Up @@ -198,6 +200,24 @@ def resolve(self, is_head: bool, node_ip_address: Optional[str] = None):
except Exception:
logger.exception("Could not parse gpu information.")

# 1. Check if the user specified num_neuron_cores in resources
num_neuron_cores = resources.get(ray_constants.NUM_NEURON_CORES, None)
# 2. Auto-detect num_neuron_cores if not specified in resources
if num_neuron_cores is None:
num_neuron_cores = _autodetect_aws_neuron_cores()
if num_neuron_cores is not None:
if num_gpus is not None:
raise ValueError("Cannot specify both num_gpus and num_neuron_cores.")

# 3. Update accelerator_type and num_neuron_cores with
# number of neuron cores detected or configured.
resources.update(
{
ray_constants.NUM_NEURON_CORES: num_neuron_cores,
get_neuron_core_constraint_name(): num_neuron_cores,
}
)

# Choose a default object store size.
system_memory = ray._private.utils.get_system_memory()
avail_memory = ray._private.utils.estimate_available_memory()
Expand Down Expand Up @@ -300,17 +320,42 @@ def _autodetect_num_gpus():
return result


def _autodetect_aws_neuron_cores():
"""
Attempt to detect the number of Neuron cores on this machine.
Returns:
The number of Neuron cores if any were detected, otherwise None.
"""
result = None
if sys.platform.startswith("linux") and os.path.isdir("/opt/aws/neuron/bin/"):
result = _get_neuron_core_count()
return result


def _get_neuron_core_count():
neuron_path = "/opt/aws/neuron/bin/"
nc_count: int = 0
result = subprocess.run(
[os.path.join(neuron_path, "neuron-ls"), "--json-output"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
if result.returncode == 0 and result.stdout:
json_out = json.loads(result.stdout)
for neuron_device in json_out:
nc_count += neuron_device["nc_count"]
return nc_count


def _get_gpu_types_gputil():
gpu_list = GPUtil.getGPUs()
if len(gpu_list) > 0:
gpu_list_names = [gpu.name for gpu in gpu_list]
info_str = gpu_list_names.pop()
pretty_name = _pretty_gpu_name(info_str)
pretty_name = _pretty_nvidia_gpu_name(info_str)
if pretty_name:
constraint_name = (
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
)
return {constraint_name: 1}
return {ray._private.utils.get_constraint_name(pretty_name): 1}
return {}


Expand All @@ -336,11 +381,8 @@ def _constraints_from_gpu_info(info_str: str):
if k.strip() == "Model":
full_model_name = v.strip()
break
pretty_name = _pretty_gpu_name(full_model_name)
if pretty_name:
constraint_name = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
return {constraint_name: 1}
return {}
pretty_name = _pretty_nvidia_gpu_name(full_model_name)
return {ray._private.utils.get_constraint_name(pretty_name): 1}


def _get_gpu_info_string():
Expand All @@ -364,11 +406,11 @@ def _get_gpu_info_string():

# TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have
# the form "Tesla V100-SXM2-16GB" or "Tesla K80").
GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")


def _pretty_gpu_name(name):
def _pretty_nvidia_gpu_name(name):
if name is None:
return None
match = GPU_NAME_PATTERN.match(name)
match = NVIDIA_GPU_NAME_PATTERN.match(name)
return match.group(1) if match else None
86 changes: 68 additions & 18 deletions python/ray/_private/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@
from ray.core.generated.runtime_env_common_pb2 import (
RuntimeEnvInfo as ProtoRuntimeEnvInfo,
)
from ray.util.accelerators import AWS_NEURON_CORE

if TYPE_CHECKING:
from ray.runtime_env import RuntimeEnv


pwd = None
if sys.platform != "win32":
import pwd
Expand All @@ -64,15 +64,13 @@
win32_job = None
win32_AssignProcessToJobObject = None


ENV_DISABLE_DOCKER_CPU_WARNING = "RAY_DISABLE_DOCKER_CPU_WARNING" in os.environ
_PYARROW_VERSION = None

# This global variable is used for testing only
_CALLED_FREQ = defaultdict(lambda: 0)
_CALLED_FREQ_LOCK = threading.Lock()


PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN = re.compile(
r"(.+)_group_(\d+)_([0-9a-zA-Z]+)"
)
Expand Down Expand Up @@ -279,29 +277,45 @@ def compute_driver_id_from_job(job_id):


def get_cuda_visible_devices():
"""Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
"""
Get the device IDs using CUDA_VISIBLE_DEVICES environment variable.
"""
return _get_visible_ids(env_var="CUDA_VISIBLE_DEVICES")


def get_aws_neuron_core_visible_ids():
"""
Get the device IDs using NEURON_RT_VISIBLE_CORES environment variable.
"""
return _get_visible_ids(env_var="NEURON_RT_VISIBLE_CORES")


def _get_visible_ids(env_var: str):
"""Get the device IDs from defined environment variable.
Args:
env_var: Environment variable to set based on the accelerator runtime.
Returns:
devices (List[str]): If CUDA_VISIBLE_DEVICES is set, returns a
list of strings representing the IDs of the visible GPUs.
devices (List[str]): If environment variable is set, returns a
list of strings representing the IDs of the visible devices or cores.
If it is not set or is set to NoDevFiles, returns empty list.
"""
gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None)
visible_ids_str = os.environ.get(env_var, None)

if gpu_ids_str is None:
if visible_ids_str is None:
return None

if gpu_ids_str == "":
if visible_ids_str == "":
return []

if gpu_ids_str == "NoDevFiles":
if visible_ids_str == "NoDevFiles":
return []

# GPU identifiers are given as strings representing integers or UUIDs.
return list(gpu_ids_str.split(","))
# Identifiers are given as strings representing integers or UUIDs.
return list(visible_ids_str.split(","))


last_set_gpu_ids = None
last_set_visible_ids = None


def set_omp_num_threads_if_unset() -> bool:
Expand Down Expand Up @@ -350,16 +364,51 @@ def set_cuda_visible_devices(gpu_ids):
Args:
gpu_ids (List[str]): List of strings representing GPU IDs.
"""

if os.environ.get(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR):
return
set_visible_ids(gpu_ids, "CUDA_VISIBLE_DEVICES")


def set_aws_neuron_core_visible_ids(core_ids):
"""Set the NEURON_RT_VISIBLE_CORES environment variable.
Args:
core_ids (List[str]): List of strings representing core IDs.
"""
if os.environ.get(ray_constants.NOSET_AWS_NEURON_VISIBLE_CORES_ENV_VAR):
return
set_visible_ids(core_ids, "NEURON_RT_VISIBLE_CORES")


def set_visible_ids(visible_ids, env_var: str):
"""Set the environment variable passed based on accelerator runtime.
Args:
visible_ids (List[str]): List of strings representing GPU IDs.
env_var: Environment variable to set based on GPU runtime.
"""

global last_set_gpu_ids
if last_set_gpu_ids == gpu_ids:
global last_set_visible_ids
if last_set_visible_ids == visible_ids:
return # optimization: already set

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])
last_set_gpu_ids = gpu_ids
os.environ[env_var] = ",".join([str(i) for i in visible_ids])
last_set_visible_ids = visible_ids


def get_neuron_core_constraint_name():
"""Get the name of the constraint that represents the AWS Neuron core accelerator.
Returns:
(str) The constraint name.
"""
return get_constraint_name(AWS_NEURON_CORE)


def get_constraint_name(pretty_name: str):
constraint_name = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
return constraint_name


def resources_from_ray_options(options_dict: Dict[str, Any]) -> Dict[str, Any]:
Expand Down Expand Up @@ -397,6 +446,7 @@ def resources_from_ray_options(options_dict: Dict[str, Any]) -> Dict[str, Any]:
resources["memory"] = int(memory)
if object_store_memory is not None:
resources["object_store_memory"] = object_store_memory
# TODO: Confirm if value is valid
if accelerator_type is not None:
resources[
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}"
Expand Down
Loading

0 comments on commit faf7e9f

Please sign in to comment.