small fixes #93

Merged (1 commit), Feb 5, 2025.
.docker/ray-rocm/Dockerfile (5 changes: 3 additions & 2 deletions)

```diff
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 WORKDIR /
 
-RUN sudo useradd -m -s /bin/bash kaiwo && \
+RUN sudo useradd -m -u 1000 -s /bin/bash kaiwo && \
     sudo usermod -a -G render,video kaiwo && \
     chown -R kaiwo:kaiwo /home/kaiwo && \
     chmod 700 /home/kaiwo && \
@@ -36,7 +36,8 @@ USER kaiwo
 WORKDIR /workload
 
 ENV VLLM_USE_TRITON_FLASH_ATTN=0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=0
 ENV TORCH_NCCL_HIGH_PRIORITY="1"
 # Use only when absolutely necessary (may cause performance degradation)
 ENV NCCL_DISABLE_P2P=0
 ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
```
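Two of these environment variables deserve a note. `TORCH_NCCL_HIGH_PRIORITY="1"` asks PyTorch's NCCL process group to use high-priority streams, which can help communication overlap with compute. `NCCL_DISABLE_P2P=0` leaves peer-to-peer GPU transfers enabled; per the in-image comment, flip it to `1` only as a last resort when debugging, since it degrades performance. If you need to change one of these for a single run rather than rebuilding the image, a container-level `env` entry overrides the image's `ENV` default. A minimal sketch, assuming a plain pod runs this image directly (the pod name and command are illustrative, not part of the PR):

```yaml
# Hypothetical pod: overrides NCCL_DISABLE_P2P for one workload
# without rebuilding the image. Name, command, and namespace are
# illustrative; the variable name mirrors the image's ENV line.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-debug-pod
  namespace: kaiwo
spec:
  restartPolicy: Never
  containers:
    - name: workload
      image: ghcr.io/silogen/rocm-ray:v0.7
      env:
        - name: NCCL_DISABLE_P2P   # overrides the image default of 0
          value: "1"               # only for debugging suspected P2P issues
      command: ["python", "-c", "import os; print(os.environ['NCCL_DISABLE_P2P'])"]
```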
README.md (16 changes: 2 additions & 14 deletions)

```diff
@@ -265,25 +265,13 @@ While Kaiwo's primary purpose is to deploy workloads, it can also be used as a l
 
 Run the commands with the `--help` flag to see all the available options.
 
-* `kaiwo list [<workload type>/[<workload name>]]` lets you browse all available workloads and interact with them
+* `kaiwo manage -n yournamespace` lets you browse all available workloads and interact with them
 * By default, only workloads that have been created by you are shown. This is inferred by your [username](#usernames)
 * If you want to list workloads by all users, use the `--all-users` flag
 * If you want to specify another user, use the `--user=...` flag
 * `kaiwo logs <workload type>/<workload name>` lets you view logs of a particular workload's container
 * `kaiwo exec <workload type>/<workload name> --command ...` lets you run a command in a workload's container
 * `kaiwo monitor <workload type>/<workload name>` lets you run a GPU monitoring command in a workload's GPU container
 
-Ensure that you provide the correct namespace for all commands via the `-n` or `--namespace` flags.
-
-### list command
-
-The `kaiwo list` command can be used as an easy entrypoint to view existing resources.
-
-### logs command
-
-### exec command
-
-### monitor command
+Ensure that you provide the correct namespace for all commands via the `-n` or `--namespace` flags.
 
 ## Contributing to Kaiwo
```
pkg/cli/apply/utils.go (2 changes: 1 addition & 1 deletion)

```diff
@@ -67,7 +67,7 @@ func GetExecFlags() workloads.ExecFlags {
 
 const (
 	defaultNamespace = "kaiwo"
-	defaultImage     = "ghcr.io/silogen/rocm-ray:v0.6"
+	defaultImage     = "ghcr.io/silogen/rocm-ray:v0.7"
 )
 
 var (
```
pkg/workloads/deployments/deployment.yaml.tmpl (4 changes: 4 additions & 0 deletions)

```diff
@@ -15,6 +15,10 @@ spec:
       labels:
         app: {{ .Meta.Name }}
     spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
      restartPolicy: Always
      {{- if .Meta.ImagePullSecret }}
      imagePullSecrets:
```
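The same `securityContext` block is added to every workload template in this PR. It pins pods to the non-root identity the Dockerfile now creates with `useradd -m -u 1000`: container processes run as UID/GID 1000, and `fsGroup: 1000` tells Kubernetes to make mounted volumes group-accessible to GID 1000, so the `kaiwo` user can write to attached storage. A self-contained sketch of the effect (pod, image, and volume names are illustrative, not from the templates):

```yaml
# Hypothetical pod demonstrating what fsGroup does: files on the mounted
# volume are group-owned by GID 1000, so the UID-1000 process can write there.
apiVersion: v1
kind: Pod
metadata:
  name: fsgroup-demo
spec:
  securityContext:
    runAsUser: 1000
    runAsGroup: 1000
    fsGroup: 1000        # the volume below is set up with group 1000
  restartPolicy: Never
  containers:
    - name: demo
      image: busybox:1.36
      command: ["sh", "-c", "id && touch /data/ok && ls -ln /data"]
      volumeMounts:
        - name: scratch
          mountPath: /data
  volumes:
    - name: scratch
      emptyDir: {}
```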
pkg/workloads/jobs/job.yaml.tmpl (4 changes: 4 additions & 0 deletions)

```diff
@@ -10,6 +10,10 @@ spec:
   ttlSecondsAfterFinished: 3600
   template:
     spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
       restartPolicy: "Never"
       {{- if .Meta.ImagePullSecret }}
       imagePullSecrets:
```
pkg/workloads/ray/deployment.yaml.tmpl (8 changes: 8 additions & 0 deletions)

```diff
@@ -15,6 +15,10 @@ spec:
       dashboard-host: "0.0.0.0"
     template:
       spec:
+        securityContext:
+          runAsUser: 1000
+          runAsGroup: 1000
+          fsGroup: 1000
         {{- if .Meta.ImagePullSecret }}
         imagePullSecrets:
           - name: {{ .Meta.ImagePullSecret }}
@@ -116,6 +120,10 @@ spec:
       rayStartParams: {}
       template:
         spec:
+          securityContext:
+            runAsUser: 1000
+            runAsGroup: 1000
+            fsGroup: 1000
           {{- if .Meta.ImagePullSecret }}
           imagePullSecrets:
             - name: {{ .Meta.ImagePullSecret }}
```
vllm-batch-single-multinode workload README:

```diff
@@ -10,11 +10,11 @@ To run this workload on 16 GPUs in `kaiwo` namespace, you can let Kaiwo automati
 
 Run with:
 
-`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ -g 16 --ray`
+`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 Or set these variables yourself with the following command:
 
-`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ --replicas 2 --gpus-per-replica 8 --ray`
+`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ --replicas 2 --gpus-per-replica 8 --ray --storage=100Gi,nameofyourstorageclass`
 
 ### Dependencies
 - Secret `hf-token`: Hugging Face API token for model download
```
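The recurring `--storage=100Gi,nameofyourstorageclass` argument added throughout these READMEs asks Kaiwo to provision persistent storage for the workload: the value is a size followed by the name of a storage class available in your cluster. A plausible equivalent of what gets requested, sketched as a plain PersistentVolumeClaim (the claim name and access mode are assumptions; the manifest Kaiwo actually generates may differ):

```yaml
# Hypothetical PVC approximating what --storage=100Gi,nameofyourstorageclass
# requests. The claim name and access mode are assumptions.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: kaiwo-workload-storage
  namespace: kaiwo
spec:
  storageClassName: nameofyourstorageclass
  accessModes:
    - ReadWriteMany        # assumed, so Ray replicas on several nodes can share it
  resources:
    requests:
      storage: 100Gi
```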
vllm-online-single-multinode workload README:

```diff
@@ -12,11 +12,11 @@ To run this workload on 16 GPUs in `kaiwo` namespace, you can let Kaiwo automati
 
 Run with:
 
-`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode -g 16 --ray`
+`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 Or set these variables yourself with the following command:
 
-`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode --replicas 2 --gpus-per-replica 8 --ray`
+`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode --replicas 2 --gpus-per-replica 8 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - Secret `hf-token`: Hugging Face API token for model download
```
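These workload READMEs all list a `hf-token` secret as a dependency. A sketch of one way to provide it, assuming the workload reads a key named `hf-token` from a Secret of the same name in the workload namespace (the key layout is an assumption; check the workload's `env` file for the exact expectation):

```yaml
# Hypothetical Secret manifest for the Hugging Face token dependency.
# The data key name is an assumption; consult the workload's env file.
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
  namespace: kaiwo
type: Opaque
stringData:
  hf-token: "<your Hugging Face API token>"
```

The equivalent one-liner, under the same key-name assumption, would be `kubectl create secret generic hf-token --from-literal=hf-token=<token> -n kaiwo`.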
full-param-zero3-single-multinode workload README:

```diff
@@ -11,7 +11,7 @@ Note! this workload expects existing secrets. Have a look at `env` file for the
 
 To run this workload on 16 GPUs in `kaiwo` namespace, set `num_devices` in `entrypoint` to `16` and use the following command:
 
-`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode -g 16 --ray`
+`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - hf-token: Hugging Face API token for model download
```
lora-sft-zero3-single-multinode workload README:

```diff
@@ -2,16 +2,17 @@
 
 ## Overview
 
-Note! this workload expects existing secrets. Have a look at `env` file for the expected secrets. If you find both S3 and GCS secrets, you can choose to use either one. Remember to refactor your code accordingly.
+This workload acts as a finetuning overlay over pre-training workload `workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode`.
 
 Keep in mind the following:
 - LORA finetuning: if you use a different model architecture, you may need to adjust LORA configuration and `target_modules` in particular.
 - Supports single-node and multi-node scenarios
 - DeepSpeed Zero stage 3 partitions LLM parameters, gradients, and optimizer states across multiple GPUs
 - set `num_devices` to total number of GPUs.
 
 To run this workload on 16 GPUs in `kaiwo` namespace, set `num_devices` in `entrypoint` to `16` and use the following command:
 
-`kaiwo submit -p workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode -g 16 --ray`
+`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode --overlay-path workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - hf-token: Hugging Face API token for model download
```
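The `target_modules` warning in that README is the part most likely to bite when swapping model families: LoRA adapters attach to specific named submodules, and those names differ between architectures. A minimal sketch of the kind of configuration being referred to, assuming a PEFT-style setup expressed in YAML (all values are illustrative, not the workload's actual defaults):

```yaml
# Illustrative LoRA settings (not the workload's actual defaults).
# target_modules must name submodules that exist in your model;
# Llama-style attention projections are shown as an example.
lora_config:
  r: 8                      # adapter rank
  lora_alpha: 16            # scaling factor
  lora_dropout: 0.05
  target_modules:           # rename for other architectures,
    - q_proj                # e.g. GPT-NeoX uses query_key_value
    - k_proj
    - v_proj
    - o_proj
```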