awslabs · vara-bonthu · Jan 24, 2024 · Oct 31, 2023 · Nov 6, 2023 · Nov 6, 2023
diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf
@@ -15,6 +15,7 @@ module "eks" {
   # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created
   subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
   substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
+
 
   manage_aws_auth_configmap = true
   aws_auth_roles = [
@@ -135,8 +136,7 @@ module "eks" {
       # The code filters the private subnets based on their CIDR blocks and selects the subnet ID if the CIDR block starts with "100." Otherwise, it assigns a null value.
       # The element(compact([...]), 0) expression ensures that only the first non-null value is included in the resulting list of subnet IDs.
       subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)
-      ]
+        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)]
 
       # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
       # ami_id   = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
@@ -176,9 +176,9 @@ module "eks" {
         echo "Bootstrap complete. Ready to Go!"
       EOT
 
-      min_size     = 0
-      max_size     = 2
-      desired_size = 0
+      min_size     = var.trn1_32xl_min_size
+      max_size     = 4
+      desired_size = var.trn1_32xl_desired_size
 
       # EFA Network Interfaces configuration for Trn1.32xlarge
       network_interfaces = [
@@ -322,9 +322,9 @@ module "eks" {
         echo "Bootstrap complete. Ready to Go!"
       EOT
 
-      min_size     = 0
-      max_size     = 1
-      desired_size = 0
+      min_size     = var.trn1n_32xl_min_size
+      max_size     = 2
+      desired_size = var.trn1n_32xl_desired_size
 
       # EFA Network Interfaces configuration for Trn1.32xlarge
       network_interfaces = [
@@ -507,9 +507,9 @@ module "eks" {
         export PATH=/opt/aws/neuron/bin:$PATH
       EOT
 
-      min_size     = 0
+      min_size     = var.inf2_24xl_min_size
       max_size     = 2
-      desired_size = 0
+      desired_size = var.inf2_24xl_desired_size
 
       labels = {
         instance-type = "inf2"
@@ -555,9 +555,9 @@ module "eks" {
         export PATH=/opt/aws/neuron/bin:$PATH
       EOT
 
-      min_size     = 0
+      min_size     = var.inf2_48xl_min_size
       max_size     = 2
-      desired_size = 0
+      desired_size = var.inf2_48xl_desired_size
 
       labels = {
         instance-type = "inf2-48xl"

diff --git a/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#
+# Builds the neuronx-nemo-megatron Llama2 pre-training image and pushes it to
+# the "neuronx_nemo" ECR repository, creating the repository if it is missing.
+#
+# Prompts for the ECR region on stdin. Writes the region to ./.eks_region and
+# the repository URI to ./.ecr_repo_uri for later use.
+# Exits 1 as soon as any prerequisite check or build step fails.
+
+# Check that we are running on an x86_64 instance to avoid issues with docker build
+arch=$(uname -m)
+if [[ "$arch" != "x86_64" ]]; then
+  echo "Error: please run this script on an x86_64-based instance"
+  exit 1
+fi
+
+# Check if docker is installed
+if ! command -v docker > /dev/null 2>&1; then
+  echo "Error: please install docker and try again. ex: for AL2023 you can run:"
+  echo "  sudo yum install docker -y"
+  echo "  sudo systemctl start docker"
+  echo "  sudo usermod -aG docker ec2-user"
+  echo "  newgrp docker"
+  exit 1
+fi
+
+# Check that AWS CLI is installed and configured
+if ! aws sts get-caller-identity > /dev/null; then
+  echo "Error: please make sure that the AWS CLI is installed and configured using 'aws configure'."
+  exit 1
+fi
+
+# Prompt user for desired region (-r keeps backslashes literal)
+read -r -p "Enter the ECR region: " region
+if [[ -z "$region" ]]; then
+  echo "Error: no region was entered."
+  exit 1
+fi
+echo "$region" > .eks_region
+
+# Replace with your desired repository name
+ECR_REPO_NAME="neuronx_nemo"
+
+# Check if the ECR repository exists
+if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$region" >/dev/null 2>&1; then
+  echo "ECR repository '$ECR_REPO_NAME' already exists."
+else
+  # Create the ECR repository; stop here if creation fails
+  if ! aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$region"; then
+    echo "Error: failed to create ECR repository '$ECR_REPO_NAME' in region '$region'."
+    exit 1
+  fi
+  echo "ECR repository '$ECR_REPO_NAME' created successfully."
+fi
+
+# Get the ECR_REPO_URI for the existing or newly created repository
+ECR_REPO_URI=$(aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$region" --output text)
+if [[ "$?" -ne 0 || -z "$ECR_REPO_URI" ]]; then
+  echo "Error: could not determine the URI of ECR repository '$ECR_REPO_NAME'."
+  exit 1
+fi
+echo "Repository URL: $ECR_REPO_URI"
+
+# Store ECR REPO URI for later use
+echo "$ECR_REPO_URI" > .ecr_repo_uri
+
+# Login to ECR
+echo -e "\nLogging in to ECR"
+if ! aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin "$ECR_REPO_URI"; then
+  echo "Error: docker login to ECR failed."
+  exit 1
+fi
+
+# Build neuronx-nemo-megatron docker image
+echo -e "\nBuilding neuronx-nemo-megatron docker image"
+if ! docker build ./docker -f ./docker/Dockerfile.llama_pretrain -t "$ECR_REPO_URI"; then
+  echo "Error: docker build failed; nothing was pushed."
+  exit 1
+fi
+
+# Push image to ECR
+echo -e "\nPushing image to ECR"
+docker push "$ECR_REPO_URI:latest"
diff --git a/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# Launches the "cli-cmd-shell" pod from the container image built by
+# 1-llama2-neuronx-pretrain-build-image.sh, with the "fsx-claim" volume
+# mounted at /shared, then lists the pods.
+
+# Check if kubectl is installed
+if ! command -v kubectl > /dev/null 2>&1; then
+  echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html"
+  exit 1
+fi
+
+# Check if kubectl is configured
+if ! kubectl get nodes > /dev/null; then
+  echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again"
+  exit 1
+fi
+
+# Read in our ECR REPO URI, created by 1-llama2-neuronx-pretrain-build-image.sh.
+# Without it the pod would be created with the invalid image name ":latest".
+if [[ ! -s .ecr_repo_uri ]]; then
+  echo "Error: .ecr_repo_uri is missing or empty. Please run 1-llama2-neuronx-pretrain-build-image.sh first"
+  exit 1
+fi
+ECR_REPO_URI=$(< .ecr_repo_uri)
+echo -e "Using container image $ECR_REPO_URI:latest"
+
+# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: cli-cmd-shell
+spec:
+  containers:
+  - name: app
+    image: $ECR_REPO_URI:latest
+    command: ["/bin/sh", "-c"]
+    args: ["while true; do sleep 30; done"]
+    volumeMounts:
+    - name: persistent-storage
+      mountPath: /shared
+    - name: dshm
+      mountPath: /dev/shm
+  volumes:
+  - name: persistent-storage
+    persistentVolumeClaim:
+      claimName: fsx-claim
+  - name: dshm
+    emptyDir:
+      medium: Memory
+  restartPolicy: Never
+EOF
+
+if [[ "$?" -eq 0 ]]; then
+  echo
+  kubectl get pods
+fi
diff --git a/ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-compile.sh b/ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-compile.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Applies the llama2-7B pre-compilation manifest: substitutes the image built
+# by 1-llama2-neuronx-pretrain-build-image.sh for IMG_PLACEHOLDER in
+# ./example_manifests/llama_compile.yaml, applies it, then lists the pods.
+
+# Check if kubectl is installed
+if ! command -v kubectl > /dev/null 2>&1; then
+  echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html"
+  exit 1
+fi
+
+# Check if kubectl is configured
+if ! kubectl get nodes > /dev/null; then
+  echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again"
+  exit 1
+fi
+
+# Read in our ECR REPO URI, created by 1-llama2-neuronx-pretrain-build-image.sh.
+# Without it IMG_PLACEHOLDER would be replaced by the invalid image name ":latest".
+if [[ ! -s .ecr_repo_uri ]]; then
+  echo "Error: .ecr_repo_uri is missing or empty. Please run 1-llama2-neuronx-pretrain-build-image.sh first"
+  exit 1
+fi
+ECR_REPO_URI=$(< .ecr_repo_uri)
+echo -e "Using container image $ECR_REPO_URI:latest"
+
+# Launch the llama2-7B pre-compilation pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
+sed "s|IMG_PLACEHOLDER|$ECR_REPO_URI:latest|" ./example_manifests/llama_compile.yaml | kubectl apply -f -
+
+if [[ "$?" -eq 0 ]]; then
+  echo
+  kubectl get pods
+fi
diff --git a/ai-ml/trainium-inferentia/examples/llama2/4-llama2-neuronx-mpi-train.sh b/ai-ml/trainium-inferentia/examples/llama2/4-llama2-neuronx-mpi-train.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Applies the llama2-7B training manifest: substitutes the image built by
+# 1-llama2-neuronx-pretrain-build-image.sh for IMG_PLACEHOLDER in
+# ./example_manifests/llama_train.yaml, applies it, then lists the pods.
+
+# Check if kubectl is installed
+if ! command -v kubectl > /dev/null 2>&1; then
+  echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html"
+  exit 1
+fi
+
+# Check if kubectl is configured
+if ! kubectl get nodes > /dev/null; then
+  echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again"
+  exit 1
+fi
+
+# Read in our ECR REPO URI, created by 1-llama2-neuronx-pretrain-build-image.sh.
+# Without it IMG_PLACEHOLDER would be replaced by the invalid image name ":latest".
+if [[ ! -s .ecr_repo_uri ]]; then
+  echo "Error: .ecr_repo_uri is missing or empty. Please run 1-llama2-neuronx-pretrain-build-image.sh first"
+  exit 1
+fi
+ECR_REPO_URI=$(< .ecr_repo_uri)
+echo -e "Using container image $ECR_REPO_URI:latest"
+
+# Launch the llama2-7B training pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
+sed "s|IMG_PLACEHOLDER|$ECR_REPO_URI:latest|" ./example_manifests/llama_train.yaml | kubectl apply -f -
+
+if [[ "$?" -eq 0 ]]; then
+  echo
+  kubectl get pods
+fi
diff --git a/ai-ml/trainium-inferentia/examples/llama2/5-deploy-tensorboard.sh b/ai-ml/trainium-inferentia/examples/llama2/5-deploy-tensorboard.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# Builds and pushes a password-protected Tensorboard image, deploys it with
+# two Services and a Deployment, waits for the external load balancer to
+# respond, then prints the URL and saves it to tensorboard_url.txt.
+# Requires .ecr_repo_uri, written by 1-llama2-neuronx-pretrain-build-image.sh.
+
+# Check if kubectl is installed
+if ! command -v kubectl > /dev/null 2>&1; then
+  echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html"
+  exit 1
+fi
+
+# Check if kubectl is configured
+if ! kubectl get nodes > /dev/null; then
+  echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again"
+  exit 1
+fi
+
+# Check for the other tools this script calls
+for tool in docker jq curl; do
+  if ! command -v "$tool" > /dev/null 2>&1; then
+    echo "Error: please install $tool and try again."
+    exit 1
+  fi
+done
+
+# Determine ECR REPO URI to which we'll push the Tensorboard image
+if [[ ! -s .ecr_repo_uri ]]; then
+  echo "Error: .ecr_repo_uri is missing or empty. Please run 1-llama2-neuronx-pretrain-build-image.sh first"
+  exit 1
+fi
+ECR_REPO_URI=$(< .ecr_repo_uri):tensorboard
+
+# Generate a random password which will be used for Tensorboard.
+# Uses a bounded 32-byte read from /dev/urandom rather than reading /dev/random line by line.
+PASSWORD=$(head -c 32 /dev/urandom | md5sum | head -c 12)
+
+# Build and push the Tensorboard image
+echo -e "Building Tensorboard container"
+if ! DOCKER_BUILDKIT=1 docker build --build-arg TB_PASSWORD="$PASSWORD" ./docker -f ./docker/Dockerfile.tensorboard -t "$ECR_REPO_URI"; then
+  echo "Error: docker build failed."
+  exit 1
+fi
+echo -e "\nPushing Tensorboard container to $ECR_REPO_URI"
+if ! docker push "$ECR_REPO_URI"; then
+  echo "Error: docker push failed."
+  exit 1
+fi
+
+# Create the Tensorboard deployment
+echo -e "\nCreating Tensorboard pod"
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: Service
+metadata:
+  name: tensorboard-ext-loadbalancer
+spec:
+  ports:
+  - port: 80
+    protocol: TCP
+    targetPort: 80
+  selector:
+    app: tensorboard-app
+  type: LoadBalancer
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app: tensorboard-app
+  name: tensorboard-service
+spec:
+  ports:
+  - name: nginx-proxy
+    port: 80
+    protocol: TCP
+    targetPort: 80
+  - name: tensorboard-server
+    port: 6006
+    protocol: TCP
+    targetPort: 6006
+  selector:
+    app: tensorboard-app
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: tensorboard-app
+  name: tensorboard-deployment
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: tensorboard-app
+  template:
+    metadata:
+      labels:
+        app: tensorboard-app
+    spec:
+      containers:
+      - args:
+        - /usr/local/bin/tensorboard --logdir /shared/nemo_experiments/ --bind_all & /usr/sbin/nginx
+          -g "daemon off;"
+        command:
+        - /bin/sh
+        - -c
+        image: $ECR_REPO_URI
+        imagePullPolicy: Always
+        name: app
+        ports:
+        - containerPort: 80
+          name: http
+        - containerPort: 6006
+          name: tensorboard
+        volumeMounts:
+        - mountPath: /shared
+          name: persistent-storage
+      restartPolicy: Always
+      volumes:
+      - name: persistent-storage
+        persistentVolumeClaim:
+          claimName: fsx-claim
+EOF
+
+# Stop here if the resources were not created; the waits below would never finish
+if [[ "$?" -ne 0 ]]; then
+  echo "Error: failed to create the Tensorboard resources."
+  exit 1
+fi
+
+# Wait for loadbalancer to be created. jq prints "null" until a hostname is
+# assigned; an empty result (failed lookup) is retried as well.
+LB_HOST=$(kubectl get service tensorboard-ext-loadbalancer -o json | jq -r ".status.loadBalancer.ingress[0].hostname")
+while [[ -z "$LB_HOST" || "$LB_HOST" == "null" ]]; do
+  sleep 2
+  LB_HOST=$(kubectl get service tensorboard-ext-loadbalancer -o json | jq -r ".status.loadBalancer.ingress[0].hostname")
+done
+
+# Now wait for loadbalancer to come online
+echo -e "\n\nWaiting for loadbalancer $LB_HOST to come online. This could take 1-2 minutes."
+sleep 5
+STATUS=$(curl -sI "$LB_HOST" | head -1)
+while [[ ! $STATUS =~ Unauthorized ]]; do
+  echo -n "."
+  sleep 10
+  STATUS=$(curl -sI "$LB_HOST" | head -1)
+done
+
+# Lastly, output the URL
+echo -e "\n\n\nTensorboard URL ==> http://admin:$PASSWORD@$LB_HOST\n\n"
+# The file holds the password, so create it readable by the owner only
+(umask 077 && echo "http://admin:$PASSWORD@$LB_HOST" > tensorboard_url.txt)