-
Notifications
You must be signed in to change notification settings - Fork 6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Train] Add accelerator ids to workers and share neuron_cores by default #39091
Changes from 8 commits
94ed600
8d168f8
5baf89a
96e50f6
04ba327
f76d4a5
8acae7a
f2b974b
3264e01
50c02f9
f27d6cd
134a1ea
d6a62e1
7357315
719d455
755dc2e
0f812fc
114699a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Any | ||
|
||
import ray | ||
import ray._private.ray_constants as ray_constants | ||
from ray.data import Dataset | ||
from ray._private.ray_constants import env_integer | ||
from ray.air.config import CheckpointConfig | ||
|
@@ -27,6 +28,8 @@ | |
TRAIN_ENABLE_WORKER_SPREAD_ENV, | ||
TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, | ||
DISABLE_LAZY_CHECKPOINTING_ENV, | ||
ENABLE_SHARE_ACCELERATOR_DEVICES_ENV, | ||
SUPPORTED_ACCELERATOR_DEVICES_TO_ENV_VAR, | ||
) | ||
from ray.util.placement_group import get_current_placement_group, remove_placement_group | ||
|
||
|
@@ -153,6 +156,13 @@ def start( | |
|
||
if self._num_gpus_per_worker > 0 and share_cuda_visible_devices_enabled: | ||
self._share_cuda_visible_devices() | ||
elif self._additional_resources_per_worker: | ||
for ( | ||
accelerator, | ||
env_var, | ||
) in SUPPORTED_ACCELERATOR_DEVICES_TO_ENV_VAR.items(): | ||
if self._share_accelerator_devices_enabled(accelerator): | ||
self._share_resource_ids(accelerator, env_var) | ||
self._backend.on_start(self.worker_group, self._backend_config) | ||
except RayActorError as exc: | ||
logger.exception(str(exc)) | ||
|
@@ -245,32 +255,81 @@ def _share_cuda_visible_devices(self): | |
- Worker2: "0,1" | ||
|
||
""" | ||
self._share_resource_ids( | ||
ray_constants.GPU, ray_constants.CUDA_VISIBLE_DEVICES_ENV_VAR | ||
) | ||
|
||
node_ids_and_gpu_ids = [ | ||
(w.metadata.node_id, w.metadata.gpu_ids) for w in self.worker_group.workers | ||
] | ||
def _share_resource_ids(self, accelerator: str, env_var: str): | ||
"""Sets the given env_var on all workers. | ||
|
||
For each worker, the given env_var will be set to the accelerator ids | ||
chappidim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
and visible to all workers on that worker's node. | ||
|
||
This allows workers on the same node to communicate with one | ||
another. | ||
|
||
Example: | ||
|
||
Setup: | ||
- Node1: | ||
- Worker1: {0, 1} | ||
- Worker2: {2, 3} | ||
- Node2: | ||
- Worker3: {0, 1} | ||
|
||
NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/...: | ||
- Worker1: "0,1,2,3" | ||
- Worker2: "0,1,2,3" | ||
- Worker3: "0,1" | ||
|
||
Args: | ||
accelerator: The name of the accelerator. | ||
env_var: The name of the environment variable to set. | ||
""" | ||
node_ids_and_resource_ids = [ | ||
( | ||
w.metadata.node_id, | ||
w.metadata.resource_ids[accelerator], | ||
) | ||
for w in self.worker_group.workers | ||
] | ||
node_id_to_worker_id = defaultdict(set) | ||
node_id_to_gpu_ids = defaultdict(set) | ||
node_id_to_resource_ids = defaultdict(set) | ||
|
||
for worker_id, (node_id, gpu_ids) in enumerate(node_ids_and_gpu_ids): | ||
for worker_id, (node_id, resource_id) in enumerate(node_ids_and_resource_ids): | ||
chappidim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
node_id_to_worker_id[node_id].add(worker_id) | ||
node_id_to_gpu_ids[node_id].update(gpu_ids) | ||
node_id_to_resource_ids[node_id].update(resource_id) | ||
|
||
futures = [] | ||
for node_id, gpu_ids in node_id_to_gpu_ids.items(): | ||
gpu_ids = sorted(gpu_ids) | ||
all_gpu_ids = ",".join(gpu_ids) | ||
for node_id, resource_runtime_ids in node_id_to_resource_ids.items(): | ||
chappidim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
resource_runtime_ids = sorted(resource_runtime_ids) | ||
all_resource_runtime_ids = ",".join(resource_runtime_ids) | ||
|
||
def set_gpu_ids(): | ||
os.environ["CUDA_VISIBLE_DEVICES"] = all_gpu_ids | ||
def set_resource_runtime_ids(): | ||
chappidim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
os.environ[env_var] = all_resource_runtime_ids | ||
|
||
for worker_id in node_id_to_worker_id[node_id]: | ||
futures.append( | ||
self.worker_group.execute_single_async(worker_id, set_gpu_ids) | ||
self.worker_group.execute_single_async( | ||
worker_id, set_resource_runtime_ids | ||
) | ||
) | ||
ray.get(futures) | ||
|
||
def _share_accelerator_devices_enabled(self, accelerator: str): | ||
"""Whether to share NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/.. | ||
on all workers. This is enabled by default if neuron_cores/TPU/.. are | ||
requested for workers. User can disable it by configuring the | ||
TRAIN_ENABLE_SHARE_ACCELERATOR_DEVICES to "0" | ||
""" | ||
return bool( | ||
env_integer( | ||
ENABLE_SHARE_ACCELERATOR_DEVICES_ENV, | ||
chappidim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self._additional_resources_per_worker.get(accelerator, None) | ||
is not None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. super nit: Let's make this a variable so it's easier to parse through the branching logic. has_accelerator_requested = self._additional_resources_per_worker.get(accelerator) is not None |
||
) | ||
) | ||
|
||
def _create_rank_world_size_mappings(self) -> List[Dict]: | ||
"""Create rank and world size mappings for workers. | ||
There are three maps returned: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,14 +44,14 @@ class WorkerMetadata: | |
node_id: ID of the node this worker is on. | ||
node_ip: IP address of the node this worker is on. | ||
hostname: Hostname that this worker is on. | ||
gpu_ids: List of CUDA IDs available to this worker. | ||
resource_ids: Map of GPU IDs, accelerator IDs (AWS NeuronCore, ..). | ||
chappidim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pid: Process ID of this worker. | ||
""" | ||
|
||
node_id: str | ||
node_ip: str | ||
hostname: str | ||
gpu_ids: Optional[List[str]] | ||
resource_ids: Dict[str, List[str]] | ||
pid: int | ||
|
||
|
||
|
@@ -86,14 +86,13 @@ def construct_metadata() -> WorkerMetadata: | |
node_id = ray.get_runtime_context().get_node_id() | ||
node_ip = ray.util.get_node_ip_address() | ||
hostname = socket.gethostname() | ||
gpu_ids = [str(gpu_id) for gpu_id in ray.get_gpu_ids()] | ||
pid = os.getpid() | ||
|
||
return WorkerMetadata( | ||
node_id=node_id, | ||
node_ip=node_ip, | ||
hostname=hostname, | ||
gpu_ids=gpu_ids, | ||
resource_ids=ray.get_runtime_context().get_resource_ids(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: For consistency with others, define this above in line 89. |
||
pid=pid, | ||
) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One thing I am thinking about is unifying the logic between GPUs and the other resources, which might be simpler since that's how it's set up in the WorkerGroup now, but this does not need to be done in this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I intentionally kept separate for now.