[Ray Core] Adds in Google Cloud TPUs as a native Resource (#38669) (#39352)

* [Ray Core] Adds in Google Cloud TPUs as a native Resource (#38669)

The issue below has more details, but at a high level this change addresses the feature request of adding TPUs as a native resource within Ray.

---------

Signed-off-by: allenwang28 <[email protected]>
Signed-off-by: Archit Kulkarni <[email protected]>
Co-authored-by: Archit Kulkarni <[email protected]>

* Fix docstring

Signed-off-by: Archit Kulkarni <[email protected]>

---------

Signed-off-by: allenwang28 <[email protected]>
Signed-off-by: Archit Kulkarni <[email protected]>
Co-authored-by: Allen Wang <[email protected]>
architkulkarni and allenwang28 authored Sep 7, 2023
1 parent 8e21832 commit a94f97d
Showing 17 changed files with 623 additions and 121 deletions.
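
For orientation, a minimal sketch of how a task might request the new TPU resource once this change is applied. It assumes a node that actually reports 4 detected TPU chips (per RAY_TPU_NUM_CHIPS_PER_HOST in the ray_constants.py diff below) and that the raylet exports TPU_VISIBLE_CHIPS to scheduled tasks, analogous to CUDA_VISIBLE_DEVICES; otherwise the task will simply stay pending.

```
import ray

ray.init()

# Request all 4 chips on a single TPU VM host; "TPU" is the native
# resource key (ray_constants.TPU) introduced by this change.
@ray.remote(resources={"TPU": 4})
def tpu_task():
    import os
    # Assumption: the raylet sets TPU_VISIBLE_CHIPS for the task,
    # analogous to CUDA_VISIBLE_DEVICES for GPUs (see the
    # NOSET_TPU_VISIBLE_CHIPS escape hatch in ray_constants.py below).
    return os.environ.get("TPU_VISIBLE_CHIPS")

print(ray.get(tpu_task.remote()))
```
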
219 changes: 175 additions & 44 deletions python/ray/_private/accelerator.py
@@ -1,79 +1,118 @@
import json
import os
import glob
import subprocess
import sys
from typing import Optional
import requests
import logging
import ray._private.ray_constants as ray_constants
import ray._private.utils as utils
import re
from typing import Iterable, Optional


def update_resources_with_accelerator_type(resources: dict):
"""Update the resources dictionary with the accelerator type and custom
resources.
Currently, we support AWS NeuronCore (neuron_cores /
accelerator_type:aws-neuron-core) detection and configuration.
Currently, we support detection and configuration of:
- AWS NeuronCore (neuron_cores / accelerator_type:aws-neuron-core)
- Google Cloud TPUs (TPU / accelerator_type:TPU-V*)
Args:
resources: Resources dictionary to be updated with
accelerator type and custom resources.
"""
_detect_and_configure_aws_neuron_core(resources)
# Autodetect AWS NeuronCore
_detect_and_configure_custom_accelerator(
resources=resources,
accelerator_key=ray_constants.NEURON_CORES,
accelerator_type=utils.get_neuron_core_constraint_name(),
visible_ids=utils.get_aws_neuron_core_visible_ids(),
autodetected_accelerators=_autodetect_aws_neuron_cores(),
visible_devices_env_variable=ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR,
)
# Autodetect Google Cloud TPUs
_detect_and_configure_custom_accelerator(
resources=resources,
accelerator_key=ray_constants.TPU,
accelerator_type=_autodetect_tpu_version(),
visible_ids=utils.get_tpu_visible_chips(),
autodetected_accelerators=_autodetect_num_tpus(),
visible_devices_env_variable=ray_constants.TPU_VISIBLE_CHIPS_ENV_VAR,
)


def _detect_and_configure_aws_neuron_core(resources: dict):
"""Configuration and auto-detection of AWS NeuronCore accelerator type
and number of NeuronCore (neuron_cores).
def _detect_and_configure_custom_accelerator(
resources: dict,
accelerator_key: str,
accelerator_type: str,
visible_ids: Optional[Iterable[str]],
autodetected_accelerators: int,
visible_devices_env_variable: str,
):
"""Configure and autodetect custom accelerators counts and types.
If the number of NeuronCore is not specified in the resources, this
function will try to detect the number of NeuronCore.
If the number of accelerators is not specified in the resources, this
function will try to detect the number of accelerators.
If the number of NeuronCore is specified in the resources, this
function will check if the number of NeuronCore is greater than the
number of visible NeuronCore and raise an error if it is true.
If the number of accelerators is specified in the resources, this
function will check if the number of accelerators is greater than the
number of visible devices and raise an error if it is true.
If the number of NeuronCore is greater than the number of visible
NeuronCore, this function will raise an error.
If the number of accelerators is greater than the number of visible
devices, this function will raise an error.
Lastly, update accelerator_type and neuron_cores in resources.
Lastly, update accelerator_type and number of accelerators in resources.
Args:
resources: Resources dictionary to be updated with
NeuronCore accelerator type and custom resources(neuron_cores).
resources: Resources dictionary to be updated with the custom
accelerator type and resource count.
accelerator_key: The key used to access the number of accelerators
within `resources`. This can be:
ray_constants.NEURON_CORES or ray_constants.TPU
accelerator_type: The name of the accelerator type. This
is the unique identifier of the accelerator version, e.g.
ray_constants.AWS_NEURON_CORE or ray_constants.GOOGLE_TPU_V4.
visible_ids: The visible IDs specified by the user. This is typically
controlled by an environment variable, e.g. NEURON_RT_VISIBLE_CORES
or TPU_VISIBLE_CHIPS.
autodetected_accelerators: The number of accelerators autodetected
on the machine.
visible_devices_env_variable: The environment variable a user uses
to specify which devices are visible.
Raises:
ValueError: If the number of NeuronCore is greater than the number of
visible NeuronCore.
ValueError: If the number of requested accelerator chips is greater
than the number of visible accelerator chips.
"""
import ray._private.ray_constants as ray_constants
import ray._private.utils as utils

# AWS NeuronCore detection and configuration
# 1. Check if the user specified neuron_cores in resources
neuron_cores = resources.get(ray_constants.NEURON_CORES, None)
# 2. Check if the user specified NEURON_RT_VISIBLE_CORES
neuron_core_ids = utils.get_aws_neuron_core_visible_ids()
# Custom accelerator detection and configuration
# 1. Check if the user specified accelerator_count in resources
accelerator_count = resources.get(accelerator_key, None)
# 2. Check if the user specified visible cores/chips (within `visible_ids`)
if (
neuron_cores is not None
and neuron_core_ids is not None
and neuron_cores > len(neuron_core_ids)
accelerator_count is not None
and visible_ids is not None
and accelerator_count > len(visible_ids)
):
raise ValueError(
f"Attempting to start raylet with {neuron_cores} "
f"neuron cores, but NEURON_RT_VISIBLE_CORES contains "
f"{neuron_core_ids}."
f"Attempting to start raylet with {accelerator_count} "
f"{accelerator_key}, but f{visible_devices_env_variable} "
f"contains {visible_ids}."
)
# 3. Auto-detect neuron_cores if not specified in resources
if neuron_cores is None:
neuron_cores = _autodetect_aws_neuron_cores()
# Don't use more neuron cores than allowed by NEURON_RT_VISIBLE_CORES.
if neuron_cores is not None and neuron_core_ids is not None:
neuron_cores = min(neuron_cores, len(neuron_core_ids))
if neuron_cores is not None:
# 4. Update accelerator_type and neuron_cores with
# number of neuron cores detected or configured.
# 3. Auto-detect accelerator_count if not specified in resources
if accelerator_count is None:
accelerator_count = autodetected_accelerators
# Don't use more resources than allowed by the user's pre-set values.
if accelerator_count is not None and visible_ids is not None:
accelerator_count = min(accelerator_count, len(visible_ids))
if accelerator_count is not None:
# 4. Update accelerator_type and accelerator_count with
# number of accelerators detected or configured.
resources.update(
{
ray_constants.NEURON_CORES: neuron_cores,
utils.get_neuron_core_constraint_name(): neuron_cores,
accelerator_key: accelerator_count,
accelerator_type: accelerator_count,
}
)
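
To make steps 1–4 above concrete, a self-contained sketch with toy values (the "TPU" key, visible IDs, and autodetected count are illustrative, not taken from the diff; the type-specific resource key the helper also records is omitted here):

```
# Toy values: the same count-vs-visible-IDs logic the generic helper
# above applies for both NeuronCore and TPU.
resources = {}                      # user did not pre-set a "TPU" count
visible_ids = ["0", "1"]            # e.g. parsed from TPU_VISIBLE_CHIPS="0,1"
autodetected = 4                    # e.g. counted from /dev/accel* on a 4-chip host

count = resources.get("TPU")
if count is not None and visible_ids is not None and count > len(visible_ids):
    raise ValueError("requested more TPU than TPU_VISIBLE_CHIPS exposes")
if count is None:
    count = autodetected            # fall back to autodetection
if count is not None and visible_ids is not None:
    count = min(count, len(visible_ids))  # never exceed user-visible chips
if count is not None:
    resources["TPU"] = count
print(resources)                    # {'TPU': 2}
```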

@@ -109,3 +148,95 @@ def _get_neuron_core_count() -> int:
for neuron_device in json_out:
nc_count += neuron_device.get("nc_count", 0)
return nc_count


def _autodetect_num_tpus() -> int:
"""Attempt to detect the number of TPUs on this machine.
TPU chips are represented as devices within `/dev/`, either as
`/dev/accel*` or `/dev/vfio/*`.
Returns:
The number of TPUs if any were detected, otherwise 0.
"""
accel_files = glob.glob("/dev/accel*")
if accel_files:
return len(accel_files)

try:
vfio_entries = os.listdir("/dev/vfio")
numeric_entries = [int(entry) for entry in vfio_entries if entry.isdigit()]
return len(numeric_entries)
except FileNotFoundError as e:
logging.info("Failed to detect number of TPUs: %s", e)
return 0


def _autodetect_tpu_version() -> Optional[str]:
"""Attempt to detect the TPU version.
Individual TPU VMs within a TPU pod must know which type
of pod they are part of. This is necessary for the
ML framework to work properly.
The logic is different if the TPU was provisioned via:
```
gcloud tpus tpu-vm create ...
```
(i.e. a GCE VM), vs through GKE:
- GCE VMs will always have a metadata server to poll this info
- GKE VMs will have environment variables preset.
Returns:
A string representing the TPU version,
e.g. "TPU-V2", "TPU-V3", "TPU-V4" if applicable, else None.
"""

def accelerator_type_to_version(accelerator_type: str) -> str:
assert_tpu_accelerator_type(accelerator_type)
return "TPU-" + str(accelerator_type.split("-")[0]).upper()

# GKE-based check
accelerator_type = os.getenv(
ray_constants.RAY_GKE_TPU_ACCELERATOR_TYPE_ENV_VAR, None
)
if accelerator_type is not None:
return accelerator_type_to_version(accelerator_type)

# GCE-based VM check
try:
accelerator_type_request = requests.get(
ray_constants.RAY_GCE_TPU_ACCELERATOR_ENDPOINT,
headers=ray_constants.RAY_GCE_TPU_HEADERS,
)
if accelerator_type_request.status_code == 200:
return accelerator_type_to_version(accelerator_type_request.text)
except requests.RequestException as e:
logging.info("Unable to poll TPU GCE metadata: %s", e)

return None


def assert_tpu_accelerator_type(accelerator_type: str):
"""Assert that the inputed accelerator_type is formatted correctly.
The accelerator_type field follows a form of v{generation}-{cores/chips}.
See the following for more information:
https://cloud.google.com/sdk/gcloud/reference/compute/tpus/tpu-vm/accelerator-types/describe
Args:
accelerator_type: The string representation of the accelerator type
to be asserted for validity.
Raises:
ValueError: If the provided accelerator_type is malformed.
"""
expected_pattern = re.compile(r"^v\d+[a-zA-Z]*-\d+$")
if not expected_pattern.match(accelerator_type):
raise ValueError(
"`acceleratorType` should match v(generation)-(cores/chips). "
f"Got {accelerator_type}."
)
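
A quick standalone check of the accelerator-type validation and version mapping added above; the sample strings are illustrative:

```
import re

# Same pattern as assert_tpu_accelerator_type: v{generation}-{cores/chips}.
expected_pattern = re.compile(r"^v\d+[a-zA-Z]*-\d+$")

for accelerator_type in ("v2-8", "v4-16", "v5litepod-16", "TPU-V4"):
    if expected_pattern.match(accelerator_type):
        # accelerator_type_to_version maps e.g. "v4-16" -> "TPU-V4".
        version = "TPU-" + accelerator_type.split("-")[0].upper()
        print(accelerator_type, "->", version)
    else:
        print(accelerator_type, "-> invalid")
# v2-8 -> TPU-V2
# v4-16 -> TPU-V4
# v5litepod-16 -> TPU-V5LITEPOD
# TPU-V4 -> invalid
```
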
30 changes: 30 additions & 0 deletions python/ray/_private/ray_constants.py
@@ -400,10 +400,16 @@ def env_set_by_user(key):
"RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES"
)
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS"

CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"

NEURON_CORES = "neuron_cores"
GPU = "GPU"
TPU = "TPU"

# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
# Subject to removal after the information is available via public API
@@ -479,9 +485,33 @@ def gcs_actor_scheduling_enabled():
RAY_DEFAULT_LABEL_KEYS_PREFIX = "ray.io/"

RAY_TPU_MAX_CONCURRENT_CONNECTIONS_ENV_VAR = "RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS"
RAY_GKE_TPU_ACCELERATOR_TYPE_ENV_VAR = "TPU_ACCELERATOR_TYPE"

# Constants for accessing the `accelerator-type` from TPU VM
# instance metadata.
# See https://cloud.google.com/compute/docs/metadata/overview
# for more details about VM instance metadata.
RAY_GCE_TPU_ACCELERATOR_ENDPOINT = (
"http://metadata.google.internal/computeMetadata/"
"v1/instance/attributes/accelerator-type"
)
RAY_GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"}

# TPU VMs come with 4 chips per host and 2 tensorcores per chip.
# For more details: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm
RAY_TPU_NUM_CHIPS_PER_HOST = 4
RAY_TPU_CORES_PER_CHIP = 2

# The following defines environment variables that allow
# us to access a subset of TPU visible chips.
#
# See: https://github.com/google/jax/issues/14977 for an example/more details.
TPU_VALID_CHIP_OPTIONS = (1, 2, 4)
TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR = "TPU_CHIPS_PER_HOST_BOUNDS"
TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIG = "1,1,1"
TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG = "1,2,1"

TPU_HOST_BOUNDS_ENV_VAR = "TPU_HOST_BOUNDS"
TPU_SINGLE_HOST_BOUNDS = "1,1,1"

RAY_NODE_IP_FILENAME = "node_ip_address.json"
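
The GCE constants above can be exercised directly; a minimal sketch of the metadata poll that _autodetect_tpu_version performs, only meaningful from inside a TPU VM that can reach the metadata server:

```
import requests

# Same endpoint and header as RAY_GCE_TPU_ACCELERATOR_ENDPOINT /
# RAY_GCE_TPU_HEADERS above.
endpoint = (
    "http://metadata.google.internal/computeMetadata/"
    "v1/instance/attributes/accelerator-type"
)
try:
    resp = requests.get(endpoint, headers={"Metadata-Flavor": "Google"})
    if resp.status_code == 200:
        print("accelerator-type:", resp.text)  # e.g. "v4-8"
except requests.RequestException as e:
    print("Metadata server unreachable (not a TPU VM?):", e)
```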