Auto-detection of accelerator_type for aws_accelerators trn1_inf #37998

Merged
merged 42 commits into ray-project:master from feat-trn1-accel on Aug 17, 2023
Changes from 2 commits
Commits
42 commits
faf7e9f
Refactor to support accelerator_type instead of num_gpus
chappidim Aug 4, 2023
dc9472d
Fix UTs, resource regex and bug fixes
chappidim Aug 5, 2023
d28744a
Improve UTs, add NC to custom_unit_instance_resources
chappidim Aug 7, 2023
bb41d2b
Bug fix on NoneType to fix UTs
chappidim Aug 7, 2023
c7818d7
Helper method, UT fixes, global var
chappidim Aug 8, 2023
54c00af
Merge branch 'ray-project:master' into feat-trn1-accel
chappidim Aug 8, 2023
f2f0d06
Refactor, validate on remote, UT coverage for accelerator
chappidim Aug 8, 2023
76bea58
Bug fixes on UT reg new annotations
chappidim Aug 9, 2023
3de32aa
Merge branch 'feat-trn1-accel' of https://github.com/chappidm/ray int…
chappidim Aug 9, 2023
e137b57
Bug fix on neuron_core options validation
chappidim Aug 9, 2023
0949995
[Another] Bug fix on neuron_core options validation
chappidim Aug 9, 2023
d1b2b96
[A3] Bug fix on neuron_core options validation
chappidim Aug 10, 2023
110f875
Merge branch 'master' into feat-trn1-accel
scv119 Aug 10, 2023
63c2e75
[A1] Fix docs using explicit import
chappidim Aug 10, 2023
71f8a71
[A2] Fix docs using explicit import
chappidim Aug 10, 2023
049b584
Annotation fixes and scheduling UT
chappidim Aug 11, 2023
f7a477d
Typo on code-block
chappidim Aug 11, 2023
2b51e1a
Fix docs build on code-block
chappidim Aug 11, 2023
83da33b
Bug fix, add UT coverage for placement_group
chappidim Aug 14, 2023
0b0e4bc
Merge branch 'master' into feat-trn1-accel
chappidim Aug 14, 2023
e62974c
Refactor with neuron_core_ids API removal
chappidim Aug 15, 2023
07637ea
Docs addition and bug fixes
chappidim Aug 15, 2023
362ef29
Merge branch 'master' into feat-trn1-accel
chappidim Aug 15, 2023
9b2de66
Merge branch 'master' into feat-trn1-accel
chappidim Aug 15, 2023
2972a4a
Docs and bug fix on local worker
chappidim Aug 15, 2023
228f896
Merge branch 'feat-trn1-accel' of https://github.com/chappidm/ray int…
chappidim Aug 15, 2023
4ee2597
Refactor and support get_gpu_and_accelerator_ids
chappidim Aug 16, 2023
1c46140
Refactor to get_resource_ids
chappidim Aug 16, 2023
73307e2
Merge branch 'master' into feat-trn1-accel
chappidim Aug 16, 2023
e8bd00b
[Docs]Fix line length
chappidim Aug 16, 2023
99e1ae2
Merge branch 'master' into feat-trn1-accel
chappidim Aug 16, 2023
1e7e94f
[Doc] Introduction to auto-detection
chappidim Aug 16, 2023
eb51da0
Merge branch 'master' into feat-trn1-accel
chappidim Aug 16, 2023
ffe5078
[Doc] Add experimental tag
chappidim Aug 16, 2023
471f3a7
Merge branch 'master' into feat-trn1-accel
chappidim Aug 16, 2023
594ce80
Merge branch 'master' into feat-trn1-accel
scv119 Aug 16, 2023
9575f87
Update doc/source/ray-core/tasks/using-ray-with-gpus.rst
scv119 Aug 16, 2023
774edc7
Update doc/source/ray-core/tasks/using-ray-with-gpus.rst
scv119 Aug 16, 2023
2810ac8
Update doc/source/ray-core/tasks/using-ray-with-gpus.rst
scv119 Aug 16, 2023
29e8b63
Refactor num_neuron_cores to neuron_cores
chappidim Aug 17, 2023
ac09f78
Merge branch 'master' into feat-trn1-accel
chappidim Aug 17, 2023
c66c86e
Merge branch 'master' into feat-trn1-accel
chappidim Aug 17, 2023
3 changes: 3 additions & 0 deletions python/ray/__init__.py
@@ -134,6 +134,7 @@ def _configure_system():
get,
get_actor,
get_gpu_ids,
get_neuron_core_ids,
init,
is_initialized,
put,
@@ -197,6 +198,7 @@ def __getattr__(self, attr):
"get",
"get_actor",
"get_gpu_ids",
"get_neuron_core_ids",
"init",
"is_initialized",
"java_actor_class",
@@ -223,6 +225,7 @@ def __getattr__(self, attr):
"get",
"get_actor",
"get_gpu_ids",
"get_neuron_core_ids",
"kill",
"put",
"wait",
17 changes: 17 additions & 0 deletions python/ray/_private/ray_constants.py
@@ -388,7 +388,24 @@ def env_set_by_user(key):

LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]

# Accelerator constants
NOSET_AWS_NEURON_VISIBLE_CORES_ENV_VAR = (
"RAY_EXPERIMENTAL_NOSET_AWS_NEURON_VISIBLE_CORES"
)
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
NUM_NEURON_CORES = "num_neuron_cores"
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
# Subject to removal after the information is available via public API
AWS_NEURON_INSTANCE_MAP = {
"trn1.2xlarge": 2,
"trn1.32xlarge": 32,
"trn1n.32xlarge": 32,
"inf2.2xlarge": 2,
"inf2.8xlarge": 2,
"inf2.24xlarge": 12,
"inf2.48xlarge": 24,
}
RAY_WORKER_NICENESS = "RAY_worker_niceness"

# Default max_retries option in @ray.remote for non-actor
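The instance map above is keyed by EC2 instance type; its consumer is not shown in this hunk. As a hypothetical illustration only (the IMDS lookup and helper name below are assumptions, not part of the PR), it could back a fallback path when neuron-ls is unavailable:

import urllib.request

import ray._private.ray_constants as ray_constants


def _neuron_cores_from_instance_type(timeout: float = 0.5):
    """Best-effort Neuron core count derived from the EC2 instance type."""
    url = "http://169.254.169.254/latest/meta-data/instance-type"
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            instance_type = resp.read().decode("utf-8").strip()
    except Exception:
        return None  # not on EC2, or the metadata service is unreachable
    return ray_constants.AWS_NEURON_INSTANCE_MAP.get(instance_type)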
68 changes: 55 additions & 13 deletions python/ray/_private/resource_spec.py
@@ -1,4 +1,5 @@
import importlib.util
import json
import logging
import os
import re
@@ -9,6 +10,7 @@

import ray
import ray._private.ray_constants as ray_constants
from ray._private.utils import get_neuron_core_constraint_name

try:
import GPUtil
@@ -198,6 +200,24 @@ def resolve(self, is_head: bool, node_ip_address: Optional[str] = None):
except Exception:
logger.exception("Could not parse gpu information.")

# 1. Check if the user specified num_neuron_cores in resources
num_neuron_cores = resources.get(ray_constants.NUM_NEURON_CORES, None)
# 2. Auto-detect num_neuron_cores if not specified in resources
if num_neuron_cores is None:
num_neuron_cores = _autodetect_aws_neuron_cores()
if num_neuron_cores is not None:
if num_gpus is not None and num_gpus > 0:
raise ValueError("Cannot specify both num_gpus and num_neuron_cores.")

# 3. Update accelerator_type and num_neuron_cores with
# number of neuron cores detected or configured.
resources.update(
{
ray_constants.NUM_NEURON_CORES: num_neuron_cores,
get_neuron_core_constraint_name(): num_neuron_cores,
}
)

# Choose a default object store size.
system_memory = ray._private.utils.get_system_memory()
avail_memory = ray._private.utils.estimate_available_memory()
@@ -300,17 +320,42 @@ def _autodetect_num_gpus():
return result


def _autodetect_aws_neuron_cores():
"""
Attempt to detect the number of Neuron cores on this machine.

Returns:
The number of Neuron cores if any were detected, otherwise None.
"""
result = None
if sys.platform.startswith("linux") and os.path.isdir("/opt/aws/neuron/bin/"):
result = _get_neuron_core_count()
return result


def _get_neuron_core_count():
neuron_path = "/opt/aws/neuron/bin/"
nc_count: int = 0
result = subprocess.run(
[os.path.join(neuron_path, "neuron-ls"), "--json-output"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
if result.returncode == 0 and result.stdout:
json_out = json.loads(result.stdout)
for neuron_device in json_out:
nc_count += neuron_device["nc_count"]
return nc_count


def _get_gpu_types_gputil():
gpu_list = GPUtil.getGPUs()
if len(gpu_list) > 0:
gpu_list_names = [gpu.name for gpu in gpu_list]
info_str = gpu_list_names.pop()
pretty_name = _pretty_gpu_name(info_str)
pretty_name = _pretty_nvidia_gpu_name(info_str)
if pretty_name:
constraint_name = (
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
)
return {constraint_name: 1}
return {ray._private.utils.get_constraint_name(pretty_name): 1}
return {}


@@ -336,11 +381,8 @@ def _constraints_from_gpu_info(info_str: str):
if k.strip() == "Model":
full_model_name = v.strip()
break
pretty_name = _pretty_gpu_name(full_model_name)
if pretty_name:
constraint_name = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
return {constraint_name: 1}
return {}
pretty_name = _pretty_nvidia_gpu_name(full_model_name)
return {ray._private.utils.get_constraint_name(pretty_name): 1}


def _get_gpu_info_string():
@@ -364,11 +406,11 @@ def _get_gpu_info_string():

# TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have
# the form "Tesla V100-SXM2-16GB" or "Tesla K80").
GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")


def _pretty_gpu_name(name):
def _pretty_nvidia_gpu_name(name):
if name is None:
return None
match = GPU_NAME_PATTERN.match(name)
match = NVIDIA_GPU_NAME_PATTERN.match(name)
return match.group(1) if match else None
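Detection above shells out to /opt/aws/neuron/bin/neuron-ls --json-output and sums the nc_count field across devices. A small stand-alone sketch of that parsing against a sample payload; fields other than nc_count are assumptions about the neuron-ls output shape:

import json

# Illustrative shape of `neuron-ls --json-output` on a two-device node.
sample_stdout = """
[
  {"neuron_device": 0, "nc_count": 2, "neuron_processes": []},
  {"neuron_device": 1, "nc_count": 2, "neuron_processes": []}
]
"""

nc_count = sum(device["nc_count"] for device in json.loads(sample_stdout))
print(nc_count)  # 4 -> becomes the node's num_neuron_cores resource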
86 changes: 68 additions & 18 deletions python/ray/_private/utils.py
@@ -44,11 +44,11 @@
from ray.core.generated.runtime_env_common_pb2 import (
RuntimeEnvInfo as ProtoRuntimeEnvInfo,
)
from ray.util.accelerators import AWS_NEURON_CORE

if TYPE_CHECKING:
from ray.runtime_env import RuntimeEnv


pwd = None
if sys.platform != "win32":
import pwd
@@ -64,15 +64,13 @@
win32_job = None
win32_AssignProcessToJobObject = None


ENV_DISABLE_DOCKER_CPU_WARNING = "RAY_DISABLE_DOCKER_CPU_WARNING" in os.environ
_PYARROW_VERSION = None

# This global variable is used for testing only
_CALLED_FREQ = defaultdict(lambda: 0)
_CALLED_FREQ_LOCK = threading.Lock()


PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN = re.compile(
r"(.+)_group_(\d+)_([0-9a-zA-Z]+)"
)
@@ -279,29 +277,45 @@ def compute_driver_id_from_job(job_id):


def get_cuda_visible_devices():
"""Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
"""
Get the device IDs using CUDA_VISIBLE_DEVICES environment variable.
"""
return _get_visible_ids(env_var="CUDA_VISIBLE_DEVICES")


def get_aws_neuron_core_visible_ids():
"""
Get the device IDs using NEURON_RT_VISIBLE_CORES environment variable.
"""
return _get_visible_ids(env_var="NEURON_RT_VISIBLE_CORES")


def _get_visible_ids(env_var: str):
"""Get the device IDs from defined environment variable.
Args:
env_var: Environment variable to read, based on the accelerator runtime.

Returns:
devices (List[str]): If CUDA_VISIBLE_DEVICES is set, returns a
list of strings representing the IDs of the visible GPUs.
devices (List[str]): If environment variable is set, returns a
list of strings representing the IDs of the visible devices or cores.
If it is not set or is set to NoDevFiles, returns empty list.
"""
gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None)
visible_ids_str = os.environ.get(env_var, None)

if gpu_ids_str is None:
if visible_ids_str is None:
return None

if gpu_ids_str == "":
if visible_ids_str == "":
return []

if gpu_ids_str == "NoDevFiles":
if visible_ids_str == "NoDevFiles":
return []

# GPU identifiers are given as strings representing integers or UUIDs.
return list(gpu_ids_str.split(","))
# Identifiers are given as strings representing integers or UUIDs.
return list(visible_ids_str.split(","))


last_set_gpu_ids = None
last_set_visible_ids = None


def set_omp_num_threads_if_unset() -> bool:
@@ -350,16 +364,51 @@ def set_cuda_visible_devices(gpu_ids):
Args:
gpu_ids (List[str]): List of strings representing GPU IDs.
"""

if os.environ.get(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR):
return
set_visible_ids(gpu_ids, "CUDA_VISIBLE_DEVICES")


def set_aws_neuron_core_visible_ids(core_ids):
"""Set the NEURON_RT_VISIBLE_CORES environment variable.

Args:
core_ids (List[str]): List of strings representing core IDs.
"""
if os.environ.get(ray_constants.NOSET_AWS_NEURON_VISIBLE_CORES_ENV_VAR):
return
set_visible_ids(core_ids, "NEURON_RT_VISIBLE_CORES")


def set_visible_ids(visible_ids, env_var: str):
"""Set the environment variable passed based on accelerator runtime.

Args:
visible_ids (List[str]): List of strings representing GPU IDs.
env_var: Environment variable to set based on GPU runtime.

"""

global last_set_gpu_ids
if last_set_gpu_ids == gpu_ids:
global last_set_visible_ids
if last_set_visible_ids == visible_ids:
return # optimization: already set

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])
last_set_gpu_ids = gpu_ids
os.environ[env_var] = ",".join([str(i) for i in visible_ids])
last_set_visible_ids = visible_ids


def get_neuron_core_constraint_name():
"""Get the name of the constraint that represents the AWS Neuron core accelerator.

Returns:
(str) The constraint name.
"""
return get_constraint_name(AWS_NEURON_CORE)


def get_constraint_name(pretty_name: str):
constraint_name = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
return constraint_name


def resources_from_ray_options(options_dict: Dict[str, Any]) -> Dict[str, Any]:
@@ -397,6 +446,7 @@ def resources_from_ray_options(options_dict: Dict[str, Any]) -> Dict[str, Any]:
resources["memory"] = int(memory)
if object_store_memory is not None:
resources["object_store_memory"] = object_store_memory
# TODO: Confirm if value is valid
if accelerator_type is not None:
resources[
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}"
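A quick interactive sketch of the generalized visible-ID helpers added above, assuming this branch is installed; it exercises the NEURON_RT_VISIBLE_CORES read path and the RAY_EXPERIMENTAL_NOSET_* escape hatch:

import os

from ray._private import utils

os.environ["NEURON_RT_VISIBLE_CORES"] = "0,1"
print(utils.get_aws_neuron_core_visible_ids())  # ['0', '1']

# With the NOSET escape hatch set, Ray leaves the variable alone so an
# externally managed Neuron runtime configuration is not clobbered.
os.environ["RAY_EXPERIMENTAL_NOSET_AWS_NEURON_VISIBLE_CORES"] = "1"
utils.set_aws_neuron_core_visible_ids(["2", "3"])
print(os.environ["NEURON_RT_VISIBLE_CORES"])  # still "0,1"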