Pre-Training LLama3.1 on AWS Trainium using Ray and PyTorch Lightning #725

Open · wants to merge 4 commits into base: main
@@ -0,0 +1,149 @@
# This RayCluster configuration deploys a distributed training environment for Llama3
# using AWS Neuron SDK and RayTrain on Amazon EKS.

# ----------------------------------------------------------------------
# NOTE: For detailed deployment instructions, refer to the DoEKS website (https://awslabs.github.io/data-on-eks/docs/category/training-on-eks).
# ----------------------------------------------------------------------

# ----------------------------------------------------------------------
# NOTE: We are using the default namespace for this deployment because the fsx-claim PVC is created under the default namespace by the Terraform blueprint.
# If you want to deploy the cluster in a dedicated namespace, ensure that the FSX for Lustre file system is also created in the same namespace since PVCs are namespace-bound.
# ----------------------------------------------------------------------

# Docs for Volcano with KubeRay: https://docs.ray.io/en/master/cluster/kubernetes/k8s-ecosystem/volcano.html
---
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
  name: llama3-training-queue
  namespace: default
spec:
  weight: 1
  capability:
    cpu: '4000'
    memory: 12000Gi

---
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: kuberay-trn1
  namespace: default
  labels:
    ray.io/scheduler-name: volcano
    volcano.sh/queue-name: llama3-training-queue
spec:
  rayVersion: 2.22.0
  headGroupSpec:
    # Head Node Configuration
    # This section defines the specification for the Ray head pod.
    # The head node manages the cluster and provides services like the dashboard and GCS.
    template:
      spec:
        containers:
          - name: ray-head
            image: <AWS_ACCOUNT_ID>.dkr.ecr.us-east-2.amazonaws.com/rayptl_pretrain_llama3.1:latest # Replace AWS Account ID
            imagePullPolicy: Always # Pull the latest image each time
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"] # Graceful shutdown of Ray processes
            ports:
              - containerPort: 8265
                name: dashboard # Expose Ray dashboard
              - containerPort: 6379
                name: redis # Expose Redis port
              - containerPort: 10001
                name: object-manager # Expose object manager port
            resources:
              requests:
                cpu: 6
                memory: 30Gi
            volumeMounts:
              - mountPath: /tmp/ray
                name: log-volume # Mount for Ray logs
              - name: persistent-storage # Mount shared filesystem (FSx for Lustre)
                mountPath: /shared
        # Node Selector for Karpenter
        # Karpenter will provision this head pod on a node with the specified labels.
        nodeSelector:
Collaborator: Are these necessary? The pod won't land on a non-CPU node due to taints, and the keys/values may be different in other deployments.

          instanceType: mixed-x86
          provisionerType: Karpenter
        volumes:
          - name: log-volume
            emptyDir: {}
          - name: persistent-storage
            persistentVolumeClaim:
              claimName: fsx-claim # Reference the PVC for shared storage
    rayStartParams:
      dashboard-host: 0.0.0.0 # Make dashboard accessible
Collaborator: Please set num-cpus: 0 so we don't schedule actors on the head node.
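A minimal sketch of that change, assuming the rest of the head group spec stays as above (Ray reads these values as strings):

    rayStartParams:
      dashboard-host: '0.0.0.0' # keep the dashboard reachable
      num-cpus: '0' # advertise zero CPUs so no actors or tasks land on the head node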


  workerGroupSpecs:
    # Worker Node Configuration
    # This section defines the specification for the Ray worker pods.
    # Worker nodes execute tasks and participate in distributed training.
    - groupName: workergroup
      replicas: 16 # Number of worker replicas
      minReplicas: 16 # Minimum number of worker replicas
      maxReplicas: 16 # Maximum number of worker replicas (no scaling in this case)
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-worker
              image: <AWS_ACCOUNT_ID>.dkr.ecr.us-east-2.amazonaws.com/rayptl_pretrain_llama3.1:latest # Replace AWS Account ID
              imagePullPolicy: Always # Pull the latest image each time
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 8265
                  name: dashboard
                - containerPort: 6379
                  name: redis
                - containerPort: 10001
                  name: object-manager
              resources:
                limits:
                  aws.amazon.com/neuron: '16' # Request AWS Neuron cores
                  vpc.amazonaws.com/efa: '8' # Request AWS EFA devices
                  memory: 440Gi
                requests:
                  aws.amazon.com/neuron: '16'
                  vpc.amazonaws.com/efa: '8'
                  cpu: '120'
                  memory: 440Gi
              volumeMounts:
                - name: persistent-storage
                  mountPath: /shared # Mount shared filesystem (FSx for Lustre)
                - name: dshm
                  mountPath: /dev/shm # Mount for shared memory
                - mountPath: /tmp/ray
                  name: log-volume # Mount for Ray logs
          # Node Selector for Managed Node Group (with Cluster Autoscaler)
          # These workers will run on Trn1 instances provisioned by the cluster autoscaler.
          # This is necessary as Karpenter doesn't currently support EFA (required for Neuron distributed training).
Collaborator: Is this true? See aws/karpenter-provider-aws#5068; I think you can request the resource.

          nodeSelector:
            instance-type: trn1-32xl
            provisioner: cluster-autoscaler

          # Tolerations for Trn1 and Dedicated Nodes
          tolerations:
            - key: "aws.amazon.com/neuron"
              operator: "Exists"
              effect: "NoSchedule"
            - key: "hub.jupyter.org/dedicated"
Collaborator: If we are trying to use the Jupyter taints to keep other pods off of Jupyter nodes, we shouldn't add a toleration for it.

operator: "Equal"
value: "user"
effect: "NoSchedule"
volumes:
# Persistent Volume Claim (PVC) to access the FSx for Lustre filesystem
- name: persistent-storage
persistentVolumeClaim:
claimName: fsx-claim
- name: dshm
emptyDir:
medium: Memory
- name: log-volume
emptyDir: {}
@@ -0,0 +1,34 @@
# ----------------------------------------------------------------------------
# RayJob: llama3-generate-pretraining-test-data
#
# Description:
# This RayJob is responsible for generating pre-training test data required for
# the Llama3 model training. It sources data from the specified dataset, processes
# it, and prepares it for use in subsequent training stages. The job runs a Python
# script (`get_dataset.py`) that performs these data preparation steps.

# Usage:
# Apply this configuration to your Kubernetes cluster using `kubectl apply -f download_fineweb_dataset.yaml`.
# Ensure that the Ray cluster (`kuberay-trn1`) is running and accessible in the specified namespace.
# ----------------------------------------------------------------------------

apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: grammarly-generate-pretraining-test-data
  namespace: default
spec:
  submissionMode: K8sJobMode
  entrypoint: "mkdir -p /shared/llama3.1_config/ /shared/fineweb_llama3.1_tokenized /shared/checkpoint /shared/tblogs && cp config.json /shared/llama3.1_config/ && python3 get_dataset.py"
  runtimeEnvYAML: |
    working_dir: /grammarly_pretraining
    env_vars:
      PYTHONUNBUFFERED: '0'
  resources:
    requests:
      cpu: "6"
      memory: "30Gi"
  clusterSelector:
    ray.io/cluster: kuberay-trn1
    rayClusterNamespace: default # Replace with the namespace where your RayCluster is deployed
  ttlSecondsAfterFinished: 60 # Time to live for the pod after completion (in seconds)
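For reference, a minimal submit-and-watch sequence for this job, assuming the manifest is saved as download_fineweb_dataset.yaml as noted in the usage comment:

kubectl apply -f download_fineweb_dataset.yaml
kubectl get rayjob grammarly-generate-pretraining-test-data -n default -w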
@@ -0,0 +1,28 @@
# ----------------------------------------------------------------------------
# RayJob: llama3.1-parallel-compile-job
#
# Description:
# This RayJob runs the Neuron parallel compile step for the Llama3 model.
# This step is a prerequisite for training the language model with the prepared dataset.

# Usage:
# Apply this configuration to your Kubernetes cluster using `kubectl apply -f 3-parallel-compile-trn1-rayjob.yaml`.
# Ensure that the Ray cluster (`kuberay-trn1`) is running and accessible in the specified namespace.
# Adjust shutdownAfterJobFinishes and ttlSecondsAfterFinished below to control cleanup after the job completes.
# ----------------------------------------------------------------------------

---
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: llama3.1-parallel-compile-job
spec:
  submissionMode: K8sJobMode
  entrypoint: "NEURON_NUM_DEVICES=32 bash run_llama3.1_8b.sh -r 2 -n 16 -l 4e-4 -s 8192 -p 1"
Collaborator: It might be worth extracting these variables to environment variables; that way you can do some hyperparameter tuning, or update them in one place by mounting them from a config file. Just a thought.
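One possible shape for that suggestion, sketched under assumptions: the variable names (COMPILE_STEPS, NUM_NODES, LEARNING_RATE, SEQ_LEN) are hypothetical, and the entrypoint is assumed to be shell-expanded with the job's runtime env applied:

  entrypoint: "NEURON_NUM_DEVICES=32 bash run_llama3.1_8b.sh -r ${COMPILE_STEPS} -n ${NUM_NODES} -l ${LEARNING_RATE} -s ${SEQ_LEN} -p 1"
  runtimeEnvYAML: |
    working_dir: /grammarly_pretraining
    env_vars:
      COMPILE_STEPS: '2'
      NUM_NODES: '16'
      LEARNING_RATE: '4e-4'
      SEQ_LEN: '8192'

The same env_vars block could then be shared between the parallel-compile and training jobs, or generated from a single config file.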

  runtimeEnvYAML: |
    working_dir: /grammarly_pretraining
  clusterSelector:
    ray.io/cluster: kuberay-trn1
    rayClusterNamespace: default # Replace with the namespace where your RayCluster is deployed
  shutdownAfterJobFinishes: true
  ttlSecondsAfterFinished: 60 # Time to live for the pod after completion (in seconds)
@@ -0,0 +1,29 @@
# ----------------------------------------------------------------------------
# RayJob: llama3-pretraining-job
#
# Description:
# This RayJob is responsible for the main pretraining step of the Llama3 model. It runs a
# Python script (`ray_train_llama3.py`) to perform the pretraining using AWS Neuron devices.
# This step is critical for training the language model with the prepared dataset.

# Usage:
# Apply this configuration to your Kubernetes cluster using `kubectl apply -f 4-grammarly-train-trn1-rayjob.yaml`.
# Ensure that the Ray cluster (`kuberay-trn1`) is running and accessible in the specified namespace.
# Adjust shutdownAfterJobFinishes and ttlSecondsAfterFinished below to control cleanup after the job completes.
# ----------------------------------------------------------------------------

---
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: llama3.1-training-job
spec:
  submissionMode: K8sJobMode
  entrypoint: "NEURON_NUM_DEVICES=32 bash run_llama3.1_8b.sh -w 500 -n 16 -l 4e-4 -s 8192"
Collaborator: Same here with the env vars.

  runtimeEnvYAML: |
    working_dir: /grammarly_pretraining
  clusterSelector:
    ray.io/cluster: kuberay-trn1
    rayClusterNamespace: default # Replace with the namespace where your RayCluster is deployed
  shutdownAfterJobFinishes: true
  ttlSecondsAfterFinished: 60 # Time to live for the pod after completion (in seconds)
34 changes: 34 additions & 0 deletions gen-ai/training/ray-ptl-llama3.1-pretrain-trn1/Dockerfile
@@ -0,0 +1,34 @@
ARG REGION

# Base image: PyTorch training image for NeuronX
FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04

# Install Ray for distributed computing
RUN pip3 install aiohttp \
Collaborator: Instead of running multiple pip installs as separate layers, please condense them into a lower layer. You can separate out dependencies you may want to update more frequently so you don't have to invalidate the entire layer, but we can probably be a little cleaner here.

&& rm -rf /root/.cache

# Install additional Python dependencies
RUN pip3 install wget awscli regex boto3 pyarrow \
&& rm -rf /root/.cache/
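Following the review comment above, a sketch of a single consolidated dependency layer (same package set as the two RUN instructions above):

RUN pip3 install aiohttp wget awscli regex boto3 pyarrow \
    && rm -rf /root/.cache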

# Copy the Llama3.1 training code into the container
# (Separate layer to rebuild only if the code changes)
COPY ./llama3.1_pretraining /llama3.1_pretraining
Collaborator: This can be mounted as a configmap, which makes this much faster to iterate on. Then you also don't have to recreate your image each time and can reuse it across jobs.
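A sketch of that alternative with hypothetical names, assuming the script directory is flat and fits under the 1 MiB ConfigMap size limit (larger assets would still need the image or the FSx volume). First publish the code:

kubectl create configmap llama31-pretraining-code --from-file=./llama3.1_pretraining/

and mount it in the Ray pod templates instead of baking it into the image:

          volumeMounts:
            - name: training-code
              mountPath: /llama3.1_pretraining
          volumes:
            - name: training-code
              configMap:
                name: llama31-pretraining-code
                defaultMode: 0755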


# Make shell scripts executable
RUN chmod +x /llama3.1_pretraining/run_llama3.1_8b.sh

# Set the working directory
WORKDIR /llama3.1_pretraining

# Installing the requirements
RUN pip install -r requirements.txt --extra-index-url https://pip.repos.neuron.amazonaws.com

# Optional: pin transformers (left commented out in this revision)
# RUN pip install transformers==4.32.1 --no-warn-conflicts

# Install neuronx-cc 2.0+
RUN pip install neuronx-cc==2.* --extra-index-url https://pip.repos.neuron.amazonaws.com -U

# Replacing libneuronxla with torch_neuronx for neuron parallel compile in Ray config file
COPY ./config.py /usr/local/lib/python3.10/site-packages/ray/train/torch/xla/config.py
83 changes: 83 additions & 0 deletions gen-ai/training/ray-ptl-llama3.1-pretrain-trn1/build_docker.sh
@@ -0,0 +1,83 @@
#!/bin/bash

# This script automates the process of building a multi-architecture Docker image
# and pushing it to an Amazon ECR (Elastic Container Registry) repository.
# The script performs the following steps:
# 1. Ensures the script is run on an x86_64-based instance.
# 2. Checks if Docker is installed and running.
# 3. Verifies that AWS CLI is installed and configured.
# 4. Prompts the user for the desired AWS region.
# 5. Checks if the specified ECR repository exists, and creates it if it does not.
# 6. Logs into Amazon ECR.
# 7. Creates a Docker Buildx builder instance to support multi-architecture builds.
# 8. Builds and pushes a multi-architecture Docker image to the specified ECR repository.

# Note: It is preferable to use AWS Cloud9 IDE with at least 100GB of storage for creating this image
# to avoid storage issues during the build process. You can use your local Mac or Windows machine,
# but ensure that you have enough memory and storage allocated for Docker to build this image.
# For more help, reach out to Perplexity or Google.

# Replace with your desired repository name
ECR_REPO_NAME="rayptl_pretrain_llama3.1"

# # Check that we are running on an x86_64 instance to avoid issues with docker build
# arch=$(uname -m)
# if [[ ! "$arch" = "x86_64" ]]; then
# echo "Error: please run this script on an x86_64-based instance"
# exit 1
# fi

# Check if docker is installed
junk=$(which docker 2>&1 > /dev/null)
if [[ "$?" -ne 0 ]]; then
echo "Error: please install docker and try again. ex: for AL2023 you can run:"
echo " sudo yum install docker -y"
echo " sudo systemctl start docker"
echo " sudo usermod -aG docker ec2-user"
echo " newgrp docker"
exit 1
fi

# Check that AWS CLI is installed and configured
junk=$(aws sts get-caller-identity)
if [[ "$?" -ne 0 ]]; then
echo "Error: please make sure that the AWS CLI is installed and configured using 'aws configure'."
exit 1
fi

# Prompt user for desired region
read -p "Enter the ECR region (ex: us-east-2): " region
echo $region > .eks_region

# Check if the ECR repository exists
if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$region" >/dev/null 2>&1; then
echo "ECR repository '$ECR_REPO_NAME' already exists."

# Get the ECR_REPO_URI for the existing repository
ECR_REPO_URI=$(aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$region" --output text)
echo "Repository URL: $ECR_REPO_URI"
else
# Create the ECR repository
aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$region"

# Get the ECR_REPO_URI for the newly created repository
ECR_REPO_URI=$(aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$region" --output text)
echo "ECR repository '$ECR_REPO_NAME' created successfully."
echo "Repository URL: $ECR_REPO_URI"
fi

# Store ECR REPO URI for later use
echo $ECR_REPO_URI > .ecr_repo_uri

# Login to ECR
echo -e "\nLogging in to ECR"
aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin $ECR_REPO_URI
aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training-neuronx
Collaborator: You have the region here set dynamically, but it is hardcoded in the image: key in the YAML deployment. Please double check.
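One way to keep the two in sync, as a sketch only: substitute the account ID and region into the manifest before applying it. The manifest file name below is hypothetical; the placeholder and hardcoded region match the image: lines in the RayCluster YAML above.

ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
sed -i "s|<AWS_ACCOUNT_ID>|${ACCOUNT_ID}|g; s|us-east-2|${region}|g" llama3-raycluster.yaml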


# Create and use a new builder instance for multi-arch builds
docker buildx create --use --name mybuilder --driver docker-container
docker buildx inspect mybuilder --bootstrap

echo -e "\nBuilding kuberay_trn1 docker image" \
&& docker buildx build --platform linux/amd64 -t $ECR_REPO_URI:latest --build-arg REGION=$region . --push \
&& echo -e "\nImage successfully pushed to ECR"
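Typical invocation, assuming the script is saved as build_docker.sh next to the Dockerfile as laid out in this PR:

chmod +x build_docker.sh
./build_docker.sh   # prompts for the ECR region, then builds and pushes the image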