From 62bbdf87131e472649855729ab93023a2bd46439 Mon Sep 17 00:00:00 2001
From: AVSuni
Date: Wed, 5 Feb 2025 08:27:09 +0200
Subject: [PATCH] small fixes (#93)

---
 .docker/ray-rocm/Dockerfile                      |  5 +++--
 README.md                                        | 16 ++--------------
 pkg/cli/apply/utils.go                           |  2 +-
 pkg/workloads/deployments/deployment.yaml.tmpl   |  4 ++++
 pkg/workloads/jobs/job.yaml.tmpl                 |  4 ++++
 pkg/workloads/ray/deployment.yaml.tmpl           |  8 ++++++++
 .../vllm-batch-single-multinode/README.md        |  4 ++--
 .../vllm-online-single-multinode/README.md       |  4 ++--
 .../full-param-zero3-single-multinode/README.md  |  2 +-
 .../lora-sft-zero3-single-multinode/README.md    |  5 +++--
 10 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/.docker/ray-rocm/Dockerfile b/.docker/ray-rocm/Dockerfile
index a477faa..0254094 100644
--- a/.docker/ray-rocm/Dockerfile
+++ b/.docker/ray-rocm/Dockerfile
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 WORKDIR /
 
-RUN sudo useradd -m -s /bin/bash kaiwo && \
+RUN sudo useradd -m -u 1000 -s /bin/bash kaiwo && \
     sudo usermod -a -G render,video kaiwo && \
     chown -R kaiwo:kaiwo /home/kaiwo && \
     chmod 700 /home/kaiwo && \
@@ -36,7 +36,8 @@ USER kaiwo
 WORKDIR /workload
 
 ENV VLLM_USE_TRITON_FLASH_ATTN=0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=0
 ENV TORCH_NCCL_HIGH_PRIORITY="1"
 # Use only when absolutely necessary (may cause performance degradation)
 ENV NCCL_DISABLE_P2P=0
-ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
\ No newline at end of file
+ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
diff --git a/README.md b/README.md
index 5030d3a..db2bc76 100644
--- a/README.md
+++ b/README.md
@@ -265,25 +265,13 @@ While Kaiwo's primary purpose is to deploy workloads, it can also be used as a l
 
 Run the commands with the `--help` flag to see all the available options.
 
-* `kaiwo list [/[]]` lets you browse all available workloads and interact with them
+* `kaiwo manage -n yournamespace` lets you browse all available workloads and interact with them
   * By default, only workloads that have been created by you are shown. This is inferred by your [username](#usernames)
   * If you want to list workloads by all users, use the `--all-users` flag
   * If you want to specify another user, use the `--user=...` flag
-* `kaiwo logs /` lets you view logs of a particular workload's container
-* `kaiwo exec / --command ...` lets you run a command in a workload's container
-* `kaiwo monitor /` lets you run a GPU monitoring command in a workload's GPU container
 
-Ensure that you provide the correct namespace for all commands via the `-n` or `--namespace` flags.
-
-### list command
-
-The `kaiwo list` command can be used as an easy entrypoint to view existing resources.
-### logs command
-
-### exec command
-
-### monitor command
+Ensure that you provide the correct namespace for all commands via the `-n` or `--namespace` flags.
 
 ## Contributing to Kaiwo
diff --git a/pkg/cli/apply/utils.go b/pkg/cli/apply/utils.go
index c4079e3..1a416c5 100644
--- a/pkg/cli/apply/utils.go
+++ b/pkg/cli/apply/utils.go
@@ -67,7 +67,7 @@ func GetExecFlags() workloads.ExecFlags {
 
 const (
     defaultNamespace = "kaiwo"
-    defaultImage     = "ghcr.io/silogen/rocm-ray:v0.6"
+    defaultImage     = "ghcr.io/silogen/rocm-ray:v0.7"
 )
 
 var (
diff --git a/pkg/workloads/deployments/deployment.yaml.tmpl b/pkg/workloads/deployments/deployment.yaml.tmpl
index 9bd3af4..4e833b4 100644
--- a/pkg/workloads/deployments/deployment.yaml.tmpl
+++ b/pkg/workloads/deployments/deployment.yaml.tmpl
@@ -15,6 +15,10 @@ spec:
       labels:
         app: {{ .Meta.Name }}
     spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
       restartPolicy: Always
       {{- if .Meta.ImagePullSecret }}
       imagePullSecrets:
diff --git a/pkg/workloads/jobs/job.yaml.tmpl b/pkg/workloads/jobs/job.yaml.tmpl
index f503782..4045703 100644
--- a/pkg/workloads/jobs/job.yaml.tmpl
+++ b/pkg/workloads/jobs/job.yaml.tmpl
@@ -10,6 +10,10 @@ spec:
   ttlSecondsAfterFinished: 3600
   template:
     spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
       restartPolicy: "Never"
       {{- if .Meta.ImagePullSecret }}
       imagePullSecrets:
diff --git a/pkg/workloads/ray/deployment.yaml.tmpl b/pkg/workloads/ray/deployment.yaml.tmpl
index 76a2eca..fed2b9f 100644
--- a/pkg/workloads/ray/deployment.yaml.tmpl
+++ b/pkg/workloads/ray/deployment.yaml.tmpl
@@ -15,6 +15,10 @@ spec:
       dashboard-host: "0.0.0.0"
     template:
       spec:
+        securityContext:
+          runAsUser: 1000
+          runAsGroup: 1000
+          fsGroup: 1000
        {{- if .Meta.ImagePullSecret }}
        imagePullSecrets:
        - name: {{ .Meta.ImagePullSecret }}
@@ -116,6 +120,10 @@
       rayStartParams: {}
       template:
         spec:
+          securityContext:
+            runAsUser: 1000
+            runAsGroup: 1000
+            fsGroup: 1000
          {{- if .Meta.ImagePullSecret }}
          imagePullSecrets:
          - name: {{ .Meta.ImagePullSecret }}
diff --git a/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md b/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md
index 0b6b2e4..39a8447 100644
--- a/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md
+++ b/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md
@@ -10,11 +10,11 @@ To run this workload on 16 GPUs in `kaiwo` namespace, you can let Kaiwo automati
 
 Run with:
 
-`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ -g 16 --ray`
+`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 Or set these variables yourself with the following command:
 
-`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ --replicas 2 --gpus-per-replica 8 --ray`
+`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ --replicas 2 --gpus-per-replica 8 --ray --storage=100Gi,nameofyourstorageclass`
 
 ### Dependencies
 - Secret `hf-token`: Hugging Face API token for model download
diff --git a/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md b/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md
index 5dacb25..bc770ed 100644
--- a/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md
+++ b/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md
@@ -12,11 +12,11 @@ To run this workload on 16 GPUs in `kaiwo` namespace, you can let Kaiwo automati
 
 Run with:
 
-`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode -g 16 --ray`
+`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 Or set these variables yourself with the following command:
 
-`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode --replicas 2 --gpus-per-replica 8 --ray`
+`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode --replicas 2 --gpus-per-replica 8 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - Secret `hf-token`: Hugging Face API token for model download
diff --git a/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md b/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md
index df49d74..0f640f3 100644
--- a/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md
+++ b/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md
@@ -11,7 +11,7 @@ Note! this workload expects existing secrets. Have a look at `env` file for the
 
 To run this workload on 16 GPUs in `kaiwo` namespace, set `num_devices` in `entrypoint` to `16` and use the following command:
 
-`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode -g 16 --ray`
+`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - hf-token: Hugging Face API token for model download
diff --git a/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md b/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md
index 9afea49..4033575 100644
--- a/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md
+++ b/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md
@@ -2,8 +2,9 @@
 ## Overview
 
-Note! this workload expects existing secrets. Have a look at `env` file for the expected secrets. If you find both S3 and GCS secrets, you can choose to use either one. Remember to refactor your code accordingly.
+This workload acts as a finetuning overlay over the pre-training workload `workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode`.
+Keep in mind the following:
 
 - LORA finetuning: if you use a different model architecture, you may need to adjust LORA configuration and `target_modules` in particular.
 - Supports single-node and multi-node scenarios
 - DeepSpeed Zero stage 3 partitions LLM parameters, gradients, and optimizer states across multiple GPUs
 
@@ -11,7 +12,7 @@
 
 To run this workload on 16 GPUs in `kaiwo` namespace, set `num_devices` in `entrypoint` to `16` and use the following command:
 
-`kaiwo submit -p workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode -g 16 --ray`
+`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode --overlay-path workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - hf-token: Hugging Face API token for model download
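
The `securityContext` blocks added above pin the workload pods to UID/GID 1000, matching the `kaiwo` user that the updated Dockerfile now creates with `useradd -m -u 1000`. As a rough sketch of the intended result (the Deployment skeleton and resource names below are illustrative, not taken from the templates; only the `securityContext` values and the image tag come from this patch), a rendered pod spec is expected to carry these fields:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-workload          # illustrative name, not from the templates
spec:
  replicas: 1
  selector:
    matchLabels:
      app: example-workload
  template:
    metadata:
      labels:
        app: example-workload
    spec:
      # Fields added by this patch to the deployment, job, and Ray templates:
      securityContext:
        runAsUser: 1000           # matches the kaiwo user created with `useradd -u 1000` in the image
        runAsGroup: 1000
        fsGroup: 1000             # volumes are mounted group-accessible to GID 1000
      containers:
        - name: workload          # illustrative container name
          image: ghcr.io/silogen/rocm-ray:v0.7   # default image bumped in pkg/cli/apply/utils.go
```

With `fsGroup: 1000`, storage requested via `--storage=100Gi,nameofyourstorageclass` should be writable by the non-root `kaiwo` user, so model downloads and checkpoints land on the mounted volume without permission errors.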