diff --git a/3.test_cases/16.pytorch-cpu-ddp/.gitignore b/3.test_cases/16.pytorch-cpu-ddp/.gitignore
new file mode 100644
index 00000000..8dec40ab
--- /dev/null
+++ b/3.test_cases/16.pytorch-cpu-ddp/.gitignore
@@ -0,0 +1,3 @@
+Miniconda3-latest*
+miniconda3
+pt_cpu
\ No newline at end of file
diff --git a/3.test_cases/16.pytorch-cpu-ddp/0.create-conda-env.sh b/3.test_cases/16.pytorch-cpu-ddp/0.create-conda-env.sh
new file mode 100644
index 00000000..818a177a
--- /dev/null
+++ b/3.test_cases/16.pytorch-cpu-ddp/0.create-conda-env.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -ex
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+chmod +x Miniconda3-latest-Linux-x86_64.sh
+./Miniconda3-latest-Linux-x86_64.sh -b -f -p ./miniconda3
+
+source ./miniconda3/bin/activate
+
+conda create -y -p ./pt_cpu python=3.10 --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge
+
+source activate ./pt_cpu/
+
+conda install -y pytorch=2.0.1 --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge
+
+rm Miniconda3-latest-Linux-x86_64.sh*
\ No newline at end of file
diff --git a/3.test_cases/16.pytorch-cpu-ddp/1.train.sbatch b/3.test_cases/16.pytorch-cpu-ddp/1.conda-train.sbatch
similarity index 88%
rename from 3.test_cases/16.pytorch-cpu-ddp/1.train.sbatch
rename to 3.test_cases/16.pytorch-cpu-ddp/1.conda-train.sbatch
index ae670343..653f45fa 100644
--- a/3.test_cases/16.pytorch-cpu-ddp/1.train.sbatch
+++ b/3.test_cases/16.pytorch-cpu-ddp/1.conda-train.sbatch
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=cpu-ddp
+#SBATCH --job-name=cpu-ddp-conda
 #SBATCH --exclusive
 #SBATCH --wait-all-nodes=1
 #SBATCH --nodes 2
@@ -14,7 +14,7 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
 
 echo Node IP: $head_node_ip
 export LOGLEVEL=INFO
-srun /opt/conda/envs/pytorch/bin/torchrun \
+srun ./pt_cpu/bin/torchrun \
 --nnodes 2 \
 --nproc_per_node 4 \
 --rdzv_id $RANDOM \
diff --git a/3.test_cases/16.pytorch-cpu-ddp/2.create-enroot-image.sh b/3.test_cases/16.pytorch-cpu-ddp/2.create-enroot-image.sh
new file mode 100644
index 00000000..bcad01f6
--- /dev/null
+++ b/3.test_cases/16.pytorch-cpu-ddp/2.create-enroot-image.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -ex
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+# Remove the old sqsh file if it exists
+if [ -f pytorch.sqsh ] ; then
+    rm pytorch.sqsh
+fi
+
+enroot import --output pytorch.sqsh docker://pytorch/pytorch
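+
+# Optional sanity check (a sketch: assumes enroot can create and start
+# containers on this host; "pytorch-check" is a throwaway container name).
+# Uncomment to verify that torch imports inside the image:
+# enroot create --name pytorch-check pytorch.sqsh
+# enroot start pytorch-check python -c "import torch; print(torch.__version__)"
+# enroot remove -f pytorch-check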
diff --git a/3.test_cases/16.pytorch-cpu-ddp/3.container-train.sbatch b/3.test_cases/16.pytorch-cpu-ddp/3.container-train.sbatch
new file mode 100644
index 00000000..4109a3c6
--- /dev/null
+++ b/3.test_cases/16.pytorch-cpu-ddp/3.container-train.sbatch
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=cpu-ddp-container
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+#SBATCH --nodes 2
+#SBATCH --cpus-per-task=4
+#SBATCH --output=logs/%x_%j.out # logfile for stdout/stderr
+
+nodes_array=( $(scontrol show hostnames $SLURM_JOB_NODELIST) )
+head_node=${nodes_array[0]}
+head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
+
+echo Node IP: $head_node_ip
+export LOGLEVEL=INFO
+
+declare -a ARGS=(
+    --container-image ${PWD}/pytorch.sqsh
+    --container-mounts ${PWD}
+)
+
+srun -l "${ARGS[@]}" torchrun \
+    --nnodes 2 \
+    --nproc_per_node 4 \
+    --rdzv_id $RANDOM \
+    --rdzv_backend c10d \
+    --rdzv_endpoint $head_node_ip:29500 \
+    ${PWD}/ddp.py 50 10
diff --git a/3.test_cases/16.pytorch-cpu-ddp/README.md b/3.test_cases/16.pytorch-cpu-ddp/README.md
index c7c929f2..71271c4a 100644
--- a/3.test_cases/16.pytorch-cpu-ddp/README.md
+++ b/3.test_cases/16.pytorch-cpu-ddp/README.md
@@ -1,6 +1,11 @@
 # PyTorch DDP on CPU
 
-This test case is intended to provide a simple distributed training example on CPU using [PyTorch DDP](https://pytorch.org/tutorials/beginner/ddp_series_theory.html).
+Isolated environments are crucial for reproducible machine learning because they encapsulate specific software versions and dependencies, ensuring models are consistently retrainable, shareable, and deployable without compatibility issues.
+
+[Anaconda](https://www.anaconda.com/) leverages conda environments to create distinct spaces for projects, allowing different Python versions and libraries to coexist without conflicts by isolating updates to their respective environments. [Docker](https://www.docker.com/), a containerization platform, packages applications and their dependencies into containers, ensuring they run seamlessly across any Linux server by providing OS-level virtualization and encapsulating the entire runtime environment.
+
+This example showcases CPU [PyTorch DDP](https://pytorch.org/tutorials/beginner/ddp_series_theory.html) environment setups using both approaches for efficient environment management.
 
 ## 1. Preparation
 
@@ -8,22 +13,32 @@ This guide assumes that you have the following:
 
 * A functional Slurm cluster on AWS, whose compute instances are based on DeepLearning AMI.
 * An FSx for Lustre filesystem mounted on `/fsx`.
+* `enroot` if you want to run the container example.
 
 We recommend that you setup a Slurm cluster using the templates in the architectures [directory](../../1.architectures).
 
-## 2. Submit training job
+## 2. Submit training job using conda environment
+
+In this step, you will create a PyTorch virtual environment using conda:
+
+```bash
+bash 0.create-conda-env.sh
+```
+
+It will prepare `miniconda3` and the `pt_cpu` conda environment; `pt_cpu` includes `torchrun`. An optional sanity check for this environment is shown at the end of this section.
 
 Submit DDP training job with:
 
 ```bash
-sbatch 1.train.sbatch
+sbatch 1.conda-train.sbatch
 ```
 
 Output of the training job can be found in `logs` directory:
 
 ```bash
-# cat logs/cpu-ddp_xxx.out
+# cat logs/cpu-ddp-conda_xxx.out
 Node IP: 10.1.96.108
 [2024-03-12 08:22:45,549] torch.distributed.run: [WARNING] master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
 [2024-03-12 08:22:45,549] torch.distributed.run: [WARNING]
@@ -58,3 +73,80 @@ Node IP: 10.1.96.108
 [2024-03-12 08:22:56,575] torch.distributed.elastic.agent.server.api: [INFO] Done waiting for other agents. Elapsed: 0.0005395412445068359 seconds
 ```
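+
+If the job fails to start, you can sanity-check the conda environment directly (optional; assumes `0.create-conda-env.sh` completed without errors):
+
+```bash
+./pt_cpu/bin/python -c "import torch; print(torch.__version__)"
+ls ./pt_cpu/bin/torchrun
+```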
+## 3. Submit training job using Docker container
+
+In this example, you'll learn how to use the official PyTorch Docker image and execute the container within the Slurm scheduler using Enroot.
+
+[Enroot](https://github.com/NVIDIA/enroot) uses the same underlying technologies as containers but removes much of the isolation they inherently provide while preserving filesystem separation. This approach is generally preferred in high-performance or virtualized environments where portability and reproducibility are important, but extra isolation is not warranted.
+
+Create the Enroot container image:
+
+```bash
+bash 2.create-enroot-image.sh
+```
+
+It will pull the `pytorch/pytorch` container, then create a [squashfs](https://www.kernel.org/doc/Documentation/filesystems/squashfs.txt) image named `pytorch.sqsh`.
+
+Submit the DDP training job using the image with:
+
+```bash
+sbatch 3.container-train.sbatch
+```
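+
+If the job fails right away, you can first smoke-test the image through Slurm (optional; this assumes the Pyxis plugin, which provides the `--container-image` option used in the sbatch file, is installed):
+
+```bash
+srun --nodes=1 --container-image ${PWD}/pytorch.sqsh python -c "import torch; print(torch.__version__)"
+```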
+
+Output of the training job can be found in the `logs` directory:
+
+```bash
+# cat logs/cpu-ddp-container_xxx.out
+Node IP: 10.1.96.108
+[2024-03-12 08:22:45,549] torch.distributed.run: [WARNING] master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
+[2024-03-12 08:22:45,549] torch.distributed.run: [WARNING]
+[2024-03-12 08:22:45,549] torch.distributed.run: [WARNING] *****************************************
+[2024-03-12 08:22:45,549] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+[2024-03-12 08:22:45,549] torch.distributed.run: [WARNING] *****************************************
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO] Starting elastic_operator with launch configs:
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   entrypoint       : ddp.py
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   min_nodes        : 2
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   max_nodes        : 2
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   nproc_per_node   : 4
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   run_id           : 5982
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   rdzv_backend     : c10d
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   rdzv_endpoint    : 10.1.96.108:29500
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   rdzv_configs     : {'timeout': 900}
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   max_restarts     : 0
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   monitor_interval : 5
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   log_dir          : None
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]   metrics_cfg      : {}
+[2024-03-12 08:22:45,549] torch.distributed.launcher.api: [INFO]
+[2024-03-12 08:22:45,552] torch.distributed.elastic.agent.server.local_elastic_agent: [INFO] log directory set to: /tmp/torchelastic_9g50nxjq/5982_tflt1tcd
+[2024-03-12 08:22:45,552] torch.distributed.elastic.agent.server.api: [INFO] [default] starting workers for entrypoint: python
+...
+[RANK 3] Epoch 49 | Batchsize: 32 | Steps: 8
+[RANK 5] Epoch 49 | Batchsize: 32 | Steps: 8
+[RANK 4] Epoch 49 | Batchsize: 32 | Steps: 8
+[2024-03-12 08:22:56,574] torch.distributed.elastic.agent.server.api: [INFO] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
+[2024-03-12 08:22:56,574] torch.distributed.elastic.agent.server.api: [INFO] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
+[2024-03-12 08:22:56,575] torch.distributed.elastic.agent.server.api: [INFO] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish.
+[2024-03-12 08:22:56,575] torch.distributed.elastic.agent.server.api: [INFO] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish
+[2024-03-12 08:22:56,575] torch.distributed.elastic.agent.server.api: [INFO] Done waiting for other agents. Elapsed: 0.0010929107666015625 seconds
+[2024-03-12 08:22:56,575] torch.distributed.elastic.agent.server.api: [INFO] Done waiting for other agents. Elapsed: 0.0005395412445068359 seconds
+```
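+
+To iterate on the training script without the multi-node rendezvous, you can run a single-node session inside the container (a minimal sketch; it assumes `ddp.py` keeps its two positional arguments, the total number of epochs and the checkpoint interval):
+
+```bash
+srun --nodes=1 --container-image ${PWD}/pytorch.sqsh --container-mounts ${PWD} \
+    torchrun --standalone --nproc_per_node 2 ${PWD}/ddp.py 1 1
+```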