[Ray Core] Adds in Google Cloud TPUs as a native Resource (#38669) (#39352)

* [Ray Core] Adds in Google Cloud TPUs as a native Resource (#38669)

The issue below has more details, but at a high level this change addresses the feature request of adding TPUs as a native resource within Ray.

---------

Signed-off-by: allenwang28 <[email protected]>
Signed-off-by: Archit Kulkarni <[email protected]>
Co-authored-by: Archit Kulkarni <[email protected]>

* Fix docstring

Signed-off-by: Archit Kulkarni <[email protected]>

---------

Signed-off-by: allenwang28 <[email protected]>
Signed-off-by: Archit Kulkarni <[email protected]>
Co-authored-by: Allen Wang <[email protected]>
architkulkarni and allenwang28 authored Sep 7, 2023
1 parent 8e21832 commit a94f97d
Showing 17 changed files with 623 additions and 121 deletions.
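
For orientation, a minimal sketch of how a task might request the new TPU resource once this change is applied. It assumes a node that actually reports 4 detected TPU chips (per RAY_TPU_NUM_CHIPS_PER_HOST in the ray_constants.py diff below) and that the raylet exports TPU_VISIBLE_CHIPS to scheduled tasks, analogous to CUDA_VISIBLE_DEVICES; otherwise the task will simply stay pending.

```
import ray

ray.init()

# Request all 4 chips on a single TPU VM host; "TPU" is the native
# resource key (ray_constants.TPU) introduced by this change.
@ray.remote(resources={"TPU": 4})
def tpu_task():
    import os
    # Assumption: the raylet sets TPU_VISIBLE_CHIPS for the task,
    # analogous to CUDA_VISIBLE_DEVICES for GPUs (see the
    # NOSET_TPU_VISIBLE_CHIPS escape hatch in ray_constants.py below).
    return os.environ.get("TPU_VISIBLE_CHIPS")

print(ray.get(tpu_task.remote()))
```
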
219 changes: 175 additions & 44 deletions python/ray/_private/accelerator.py
@@ -1,79 +1,118 @@
import json
import os
import glob
import subprocess
import sys
from typing import Optional
import requests
import logging
import ray._private.ray_constants as ray_constants
import ray._private.utils as utils
import re
from typing import Iterable, Optional


def update_resources_with_accelerator_type(resources: dict):
"""Update the resources dictionary with the accelerator type and custom
resources.
Currently, we support AWS NeuronCore (neuron_cores /
accelerator_type:aws-neuron-core) detection and configuration.
Currently, we support detection and configuration of:
- AWS NeuronCore (neuron_cores / accelerator_type:aws-neuron-core)
- Google Cloud TPUs (TPU / accelerator_type:TPU-V*)
Args:
resources: Resources dictionary to be updated with
accelerator type and custom resources.
"""
_detect_and_configure_aws_neuron_core(resources)
# Autodetect AWS NeuronCore
_detect_and_configure_custom_accelerator(
resources=resources,
accelerator_key=ray_constants.NEURON_CORES,
accelerator_type=utils.get_neuron_core_constraint_name(),
visible_ids=utils.get_aws_neuron_core_visible_ids(),
autodetected_accelerators=_autodetect_aws_neuron_cores(),
visible_devices_env_variable=ray_constants.NEURON_RT_VISIBLE_CORES_ENV_VAR,
)
# Autodetect Google Cloud TPUs
_detect_and_configure_custom_accelerator(
resources=resources,
accelerator_key=ray_constants.TPU,
accelerator_type=_autodetect_tpu_version(),
visible_ids=utils.get_tpu_visible_chips(),
autodetected_accelerators=_autodetect_num_tpus(),
visible_devices_env_variable=ray_constants.TPU_VISIBLE_CHIPS_ENV_VAR,
)


def _detect_and_configure_aws_neuron_core(resources: dict):
"""Configuration and auto-detection of AWS NeuronCore accelerator type
and number of NeuronCore (neuron_cores).
def _detect_and_configure_custom_accelerator(
resources: dict,
accelerator_key: str,
accelerator_type: str,
visible_ids: Optional[Iterable[str]],
autodetected_accelerators: int,
visible_devices_env_variable: str,
):
"""Configure and autodetect custom accelerators counts and types.
If the number of NeuronCore is not specified in the resources, this
function will try to detect the number of NeuronCore.
If the number of accelerators is not specified in the resources, this
function will try to detect the number of accelerators.
If the number of NeuronCore is specified in the resources, this
function will check if the number of NeuronCore is greater than the
number of visible NeuronCore and raise an error if it is true.
If the number of accelerators is specified in the resources, this
function will check if the number of accelerators is greater than the
number of visible devices and raise an error if it is true.
If the number of NeuronCore is greater than the number of visible
NeuronCore, this function will raise an error.
If the number of accelerators is greater than the number of visible
devices, this function will raise an error.
Lastly, update accelerator_type and neuron_cores in resources.
Lastly, update accelerator_type and number of accelerators in resources.
Args:
resources: Resources dictionary to be updated with
NeuronCore accelerator type and custom resources(neuron_cores).
resources: Resources dictionary to be updated with the custom
accelerator type and resource count.
accelerator_key: The key used to access the number of accelerators
within `resources`. This can be:
ray_constants.NEURON_CORES or ray_constants.TPU
accelerator_type: The name of the accelerator type. This
is the unique identifier of the accelerator version, e.g.
ray_constants.AWS_NEURON_CORE or ray_constants.GOOGLE_TPU_V4.
visible_ids: The visible IDs specified by the user. This is typically
controlled by an environment variable, e.g. NEURON_RT_VISIBLE_CORES
or TPU_VISIBLE_CHIPS.
autodetected_accelerators: The number of accelerators autodetected
on the machine.
visible_devices_env_variable: The environment variable a user uses
to specify which devices are visible.
Raises:
ValueError: If the number of NeuronCore is greater than the number of
visible NeuronCore.
ValueError: If the number of requested accelerator chips is greater
than the number of visible accelerator chips.
"""
import ray._private.ray_constants as ray_constants
import ray._private.utils as utils

# AWS NeuronCore detection and configuration
# 1. Check if the user specified neuron_cores in resources
neuron_cores = resources.get(ray_constants.NEURON_CORES, None)
# 2. Check if the user specified NEURON_RT_VISIBLE_CORES
neuron_core_ids = utils.get_aws_neuron_core_visible_ids()
# Custom accelerator detection and configuration
# 1. Check if the user specified accelerator_count in resources
accelerator_count = resources.get(accelerator_key, None)
# 2. Check if the user specified visible cores/chips (within `visible_ids`)
if (
neuron_cores is not None
and neuron_core_ids is not None
and neuron_cores > len(neuron_core_ids)
accelerator_count is not None
and visible_ids is not None
and accelerator_count > len(visible_ids)
):
raise ValueError(
f"Attempting to start raylet with {neuron_cores} "
f"neuron cores, but NEURON_RT_VISIBLE_CORES contains "
f"{neuron_core_ids}."
f"Attempting to start raylet with {accelerator_count} "
f"{accelerator_key}, but f{visible_devices_env_variable} "
f"contains {visible_ids}."
)
# 3. Auto-detect neuron_cores if not specified in resources
if neuron_cores is None:
neuron_cores = _autodetect_aws_neuron_cores()
# Don't use more neuron cores than allowed by NEURON_RT_VISIBLE_CORES.
if neuron_cores is not None and neuron_core_ids is not None:
neuron_cores = min(neuron_cores, len(neuron_core_ids))
if neuron_cores is not None:
# 4. Update accelerator_type and neuron_cores with
# number of neuron cores detected or configured.
# 3. Auto-detect accelerator_count if not specified in resources
if accelerator_count is None:
accelerator_count = autodetected_accelerators
# Don't use more resources than allowed by the user's pre-set values.
if accelerator_count is not None and visible_ids is not None:
accelerator_count = min(accelerator_count, len(visible_ids))
if accelerator_count is not None:
# 4. Update accelerator_type and accelerator_count with
# number of accelerators detected or configured.
resources.update(
{
ray_constants.NEURON_CORES: neuron_cores,
utils.get_neuron_core_constraint_name(): neuron_cores,
accelerator_key: accelerator_count,
accelerator_type: accelerator_count,
}
)
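
To make steps 1–4 above concrete, a self-contained sketch with toy values (the "TPU" key, visible IDs, and autodetected count are illustrative, not taken from the diff; the type-specific resource key the helper also records is omitted here):

```
# Toy values: the same count-vs-visible-IDs logic the generic helper
# above applies for both NeuronCore and TPU.
resources = {}                      # user did not pre-set a "TPU" count
visible_ids = ["0", "1"]            # e.g. parsed from TPU_VISIBLE_CHIPS="0,1"
autodetected = 4                    # e.g. counted from /dev/accel* on a 4-chip host

count = resources.get("TPU")
if count is not None and visible_ids is not None and count > len(visible_ids):
    raise ValueError("requested more TPU than TPU_VISIBLE_CHIPS exposes")
if count is None:
    count = autodetected            # fall back to autodetection
if count is not None and visible_ids is not None:
    count = min(count, len(visible_ids))  # never exceed user-visible chips
if count is not None:
    resources["TPU"] = count
print(resources)                    # {'TPU': 2}
```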

@@ -109,3 +148,95 @@ def _get_neuron_core_count() -> int:
for neuron_device in json_out:
nc_count += neuron_device.get("nc_count", 0)
return nc_count


def _autodetect_num_tpus() -> int:
"""Attempt to detect the number of TPUs on this machine.
TPU chips are represented as devices within `/dev/`, either as
`/dev/accel*` or `/dev/vfio/*`.
Returns:
The number of TPUs if any were detected, otherwise 0.
"""
accel_files = glob.glob("/dev/accel*")
if accel_files:
return len(accel_files)

try:
vfio_entries = os.listdir("/dev/vfio")
numeric_entries = [int(entry) for entry in vfio_entries if entry.isdigit()]
return len(numeric_entries)
except FileNotFoundError as e:
logging.info("Failed to detect number of TPUs: %s", e)
return 0


def _autodetect_tpu_version() -> Optional[str]:
"""Attempt to detect the TPU version.
Individual TPU VMs within a TPU pod must know which type
of pod they are part of. This is necessary for the
ML framework to work properly.
The logic is different if the TPU was provisioned via:
```
gcloud tpus tpu-vm create ...
```
(i.e. a GCE VM), vs through GKE:
- GCE VMs will always have a metadata server to poll this info
- GKE VMs will have environment variables preset.
Returns:
A string representing the TPU version,
e.g. "TPU-V2", "TPU-V3", "TPU-V4" if applicable, else None.
"""

def accelerator_type_to_version(accelerator_type: str) -> str:
assert_tpu_accelerator_type(accelerator_type)
return "TPU-" + str(accelerator_type.split("-")[0]).upper()

# GKE-based check
accelerator_type = os.getenv(
ray_constants.RAY_GKE_TPU_ACCELERATOR_TYPE_ENV_VAR, None
)
if accelerator_type is not None:
return accelerator_type_to_version(accelerator_type)

# GCE-based VM check
try:
accelerator_type_request = requests.get(
ray_constants.RAY_GCE_TPU_ACCELERATOR_ENDPOINT,
headers=ray_constants.RAY_GCE_TPU_HEADERS,
)
if accelerator_type_request.status_code == 200:
return accelerator_type_to_version(accelerator_type_request.text)
except requests.RequestException as e:
logging.info("Unable to poll TPU GCE metadata: %s", e)

return None


def assert_tpu_accelerator_type(accelerator_type: str):
"""Assert that the inputed accelerator_type is formatted correctly.
The accelerator_type field follows a form of v{generation}-{cores/chips}.
See the following for more information:
https://cloud.google.com/sdk/gcloud/reference/compute/tpus/tpu-vm/accelerator-types/describe
Args:
accelerator_type: The string representation of the accelerator type
to be asserted for validity.
Raises:
ValueError: If the provided accelerator_type is malformed.
"""
expected_pattern = re.compile(r"^v\d+[a-zA-Z]*-\d+$")
if not expected_pattern.match(accelerator_type):
raise ValueError(
"`acceleratorType` should match v(generation)-(cores/chips). "
f"Got {accelerator_type}."
)
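
A quick standalone check of the accelerator-type validation and version mapping added above; the sample strings are illustrative:

```
import re

# Same pattern as assert_tpu_accelerator_type: v{generation}-{cores/chips}.
expected_pattern = re.compile(r"^v\d+[a-zA-Z]*-\d+$")

for accelerator_type in ("v2-8", "v4-16", "v5litepod-16", "TPU-V4"):
    if expected_pattern.match(accelerator_type):
        # accelerator_type_to_version maps e.g. "v4-16" -> "TPU-V4".
        version = "TPU-" + accelerator_type.split("-")[0].upper()
        print(accelerator_type, "->", version)
    else:
        print(accelerator_type, "-> invalid")
# v2-8 -> TPU-V2
# v4-16 -> TPU-V4
# v5litepod-16 -> TPU-V5LITEPOD
# TPU-V4 -> invalid
```
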
30 changes: 30 additions & 0 deletions python/ray/_private/ray_constants.py
@@ -400,10 +400,16 @@ def env_set_by_user(key):
"RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES"
)
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS"

CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"

NEURON_CORES = "neuron_cores"
GPU = "GPU"
TPU = "TPU"

# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
# Subject to removal after the information is available via public API
@@ -479,9 +485,33 @@ def gcs_actor_scheduling_enabled():
RAY_DEFAULT_LABEL_KEYS_PREFIX = "ray.io/"

RAY_TPU_MAX_CONCURRENT_CONNECTIONS_ENV_VAR = "RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS"
RAY_GKE_TPU_ACCELERATOR_TYPE_ENV_VAR = "TPU_ACCELERATOR_TYPE"

# Constants for accessing the `accelerator-type` from TPU VM
# instance metadata.
# See https://cloud.google.com/compute/docs/metadata/overview
# for more details about VM instance metadata.
RAY_GCE_TPU_ACCELERATOR_ENDPOINT = (
"http://metadata.google.internal/computeMetadata/"
"v1/instance/attributes/accelerator-type"
)
RAY_GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"}

# TPU VMs come with 4 chips per host and 2 tensorcores per chip.
# For more details: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm
RAY_TPU_NUM_CHIPS_PER_HOST = 4
RAY_TPU_CORES_PER_CHIP = 2

# The following defines environment variables that allow
# us to access a subset of TPU visible chips.
#
# See: https://github.com/google/jax/issues/14977 for an example/more details.
TPU_VALID_CHIP_OPTIONS = (1, 2, 4)
TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR = "TPU_CHIPS_PER_HOST_BOUNDS"
TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIG = "1,1,1"
TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG = "1,2,1"

TPU_HOST_BOUNDS_ENV_VAR = "TPU_HOST_BOUNDS"
TPU_SINGLE_HOST_BOUNDS = "1,1,1"

RAY_NODE_IP_FILENAME = "node_ip_address.json"
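
The GCE constants above can be exercised directly; a minimal sketch of the metadata poll that _autodetect_tpu_version performs, only meaningful from inside a TPU VM that can reach the metadata server:

```
import requests

# Same endpoint and header as RAY_GCE_TPU_ACCELERATOR_ENDPOINT /
# RAY_GCE_TPU_HEADERS above.
endpoint = (
    "http://metadata.google.internal/computeMetadata/"
    "v1/instance/attributes/accelerator-type"
)
try:
    resp = requests.get(endpoint, headers={"Metadata-Flavor": "Google"})
    if resp.status_code == 200:
        print("accelerator-type:", resp.text)  # e.g. "v4-8"
except requests.RequestException as e:
    print("Metadata server unreachable (not a TPU VM?):", e)
```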