From 62bbdf87131e472649855729ab93023a2bd46439 Mon Sep 17 00:00:00 2001
From: AVSuni
Date: Wed, 5 Feb 2025 08:27:09 +0200
Subject: [PATCH] small fixes (#93)

---
 .docker/ray-rocm/Dockerfile                      |  5 +++--
 README.md                                        | 16 ++--------------
 pkg/cli/apply/utils.go                           |  2 +-
 pkg/workloads/deployments/deployment.yaml.tmpl   |  4 ++++
 pkg/workloads/jobs/job.yaml.tmpl                 |  4 ++++
 pkg/workloads/ray/deployment.yaml.tmpl           |  8 ++++++++
 .../vllm-batch-single-multinode/README.md        |  4 ++--
 .../vllm-online-single-multinode/README.md       |  4 ++--
 .../full-param-zero3-single-multinode/README.md  |  2 +-
 .../lora-sft-zero3-single-multinode/README.md    |  5 +++--
 10 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/.docker/ray-rocm/Dockerfile b/.docker/ray-rocm/Dockerfile
index a477faa..0254094 100644
--- a/.docker/ray-rocm/Dockerfile
+++ b/.docker/ray-rocm/Dockerfile
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 WORKDIR /
 
-RUN sudo useradd -m -s /bin/bash kaiwo && \
+RUN sudo useradd -m -u 1000 -s /bin/bash kaiwo && \
     sudo usermod -a -G render,video kaiwo && \
     chown -R kaiwo:kaiwo /home/kaiwo && \
     chmod 700 /home/kaiwo && \
@@ -36,7 +36,8 @@ USER kaiwo
 WORKDIR /workload
 
 ENV VLLM_USE_TRITON_FLASH_ATTN=0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=0
 ENV TORCH_NCCL_HIGH_PRIORITY="1"
 # Use only when absolutely necessary (may cause performance degradation)
 ENV NCCL_DISABLE_P2P=0
-ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
\ No newline at end of file
+ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
diff --git a/README.md b/README.md
index 5030d3a..db2bc76 100644
--- a/README.md
+++ b/README.md
@@ -265,25 +265,13 @@ While Kaiwo's primary purpose is to deploy workloads, it can also be used as a l
 
 Run the commands with the `--help` flag to see all the available options.
 
-* `kaiwo list [/[]]` lets you browse all available workloads and interact with them
+* `kaiwo manage -n yournamespace` lets you browse all available workloads and interact with them
   * By default, only workloads that have been created by you are shown. This is inferred by your [username](#usernames)
   * If you want to list workloads by all users, use the `--all-users` flag
   * If you want to specify another user, use the `--user=...` flag
-* `kaiwo logs /` lets you view logs of a particular workload's container
-* `kaiwo exec / --command ...` lets you run a command in a workload's container
-* `kaiwo monitor /` lets you run a GPU monitoring command in a workload's GPU container
 
-Ensure that you provide the correct namespace for all commands via the `-n` or `--namespace` flags.
-
-### list command
-
-The `kaiwo list` command can be used as an easy entrypoint to view existing resources.
-### logs command
-
-### exec command
-
-### monitor command
+Ensure that you provide the correct namespace for all commands via the `-n` or `--namespace` flags.
 
 ## Contributing to Kaiwo
diff --git a/pkg/cli/apply/utils.go b/pkg/cli/apply/utils.go
index c4079e3..1a416c5 100644
--- a/pkg/cli/apply/utils.go
+++ b/pkg/cli/apply/utils.go
@@ -67,7 +67,7 @@ func GetExecFlags() workloads.ExecFlags {
 
 const (
     defaultNamespace = "kaiwo"
-    defaultImage     = "ghcr.io/silogen/rocm-ray:v0.6"
+    defaultImage     = "ghcr.io/silogen/rocm-ray:v0.7"
 )
 
 var (
diff --git a/pkg/workloads/deployments/deployment.yaml.tmpl b/pkg/workloads/deployments/deployment.yaml.tmpl
index 9bd3af4..4e833b4 100644
--- a/pkg/workloads/deployments/deployment.yaml.tmpl
+++ b/pkg/workloads/deployments/deployment.yaml.tmpl
@@ -15,6 +15,10 @@ spec:
       labels:
         app: {{ .Meta.Name }}
     spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
       restartPolicy: Always
       {{- if .Meta.ImagePullSecret }}
       imagePullSecrets:
diff --git a/pkg/workloads/jobs/job.yaml.tmpl b/pkg/workloads/jobs/job.yaml.tmpl
index f503782..4045703 100644
--- a/pkg/workloads/jobs/job.yaml.tmpl
+++ b/pkg/workloads/jobs/job.yaml.tmpl
@@ -10,6 +10,10 @@ spec:
   ttlSecondsAfterFinished: 3600
   template:
     spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
       restartPolicy: "Never"
       {{- if .Meta.ImagePullSecret }}
       imagePullSecrets:
diff --git a/pkg/workloads/ray/deployment.yaml.tmpl b/pkg/workloads/ray/deployment.yaml.tmpl
index 76a2eca..fed2b9f 100644
--- a/pkg/workloads/ray/deployment.yaml.tmpl
+++ b/pkg/workloads/ray/deployment.yaml.tmpl
@@ -15,6 +15,10 @@ spec:
       dashboard-host: "0.0.0.0"
     template:
       spec:
+        securityContext:
+          runAsUser: 1000
+          runAsGroup: 1000
+          fsGroup: 1000
        {{- if .Meta.ImagePullSecret }}
        imagePullSecrets:
        - name: {{ .Meta.ImagePullSecret }}
@@ -116,6 +120,10 @@
       rayStartParams: {}
       template:
         spec:
+          securityContext:
+            runAsUser: 1000
+            runAsGroup: 1000
+            fsGroup: 1000
          {{- if .Meta.ImagePullSecret }}
          imagePullSecrets:
          - name: {{ .Meta.ImagePullSecret }}
diff --git a/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md b/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md
index 0b6b2e4..39a8447 100644
--- a/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md
+++ b/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/README.md
@@ -10,11 +10,11 @@ To run this workload on 16 GPUs in `kaiwo` namespace, you can let Kaiwo automati
 
 Run with:
 
-`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ -g 16 --ray`
+`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 Or set these variables yourself with the following command:
 
-`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ --replicas 2 --gpus-per-replica 8 --ray`
+`kaiwo submit -p workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/ --replicas 2 --gpus-per-replica 8 --ray --storage=100Gi,nameofyourstorageclass`
 
 ### Dependencies
 - Secret `hf-token`: Hugging Face API token for model download
diff --git a/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md b/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md
index 5dacb25..bc770ed 100644
--- a/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md
+++ b/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/README.md
@@ -12,11 +12,11 @@ To run this workload on 16 GPUs in `kaiwo` namespace, you can let Kaiwo automati
 
 Run with:
 
-`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode -g 16 --ray`
+`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 Or set these variables yourself with the following command:
 
-`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode --replicas 2 --gpus-per-replica 8 --ray`
+`kaiwo serve -p workloads/inference/LLMs/online-inference/vllm-online-single-multinode --replicas 2 --gpus-per-replica 8 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - Secret `hf-token`: Hugging Face API token for model download
diff --git a/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md b/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md
index df49d74..0f640f3 100644
--- a/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md
+++ b/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/README.md
@@ -11,7 +11,7 @@ Note! this workload expects existing secrets. Have a look at `env` file for the
 
 To run this workload on 16 GPUs in `kaiwo` namespace, set `num_devices` in `entrypoint` to `16` and use the following command:
 
-`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode -g 16 --ray`
+`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - hf-token: Hugging Face API token for model download
diff --git a/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md b/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md
index 9afea49..4033575 100644
--- a/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md
+++ b/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/README.md
@@ -2,8 +2,9 @@
 ## Overview
 
-Note! this workload expects existing secrets. Have a look at `env` file for the expected secrets. If you find both S3 and GCS secrets, you can choose to use either one. Remember to refactor your code accordingly.
+This workload acts as a finetuning overlay over the pre-training workload `workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode`.
+Keep in mind the following:
 
 - LORA finetuning: if you use a different model architecture, you may need to adjust LORA configuration and `target_modules` in particular.
 - Supports single-node and multi-node scenarios
 - DeepSpeed Zero stage 3 partitions LLM parameters, gradients, and optimizer states across multiple GPUs
 
@@ -11,7 +12,7 @@
 
 To run this workload on 16 GPUs in `kaiwo` namespace, set `num_devices` in `entrypoint` to `16` and use the following command:
 
-`kaiwo submit -p workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode -g 16 --ray`
+`kaiwo submit -p workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode --overlay-path workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode -g 16 --ray --storage=100Gi,nameofyourstorageclass`
 
 ## Dependencies
 - hf-token: Hugging Face API token for model download
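
The `securityContext` blocks added above pin the workload pods to UID/GID 1000, matching the `kaiwo` user that the updated Dockerfile now creates with `useradd -m -u 1000`. As a rough sketch of the intended result (the Deployment skeleton and resource names below are illustrative, not taken from the templates; only the `securityContext` values and the image tag come from this patch), a rendered pod spec is expected to carry these fields:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-workload          # illustrative name, not from the templates
spec:
  replicas: 1
  selector:
    matchLabels:
      app: example-workload
  template:
    metadata:
      labels:
        app: example-workload
    spec:
      # Fields added by this patch to the deployment, job, and Ray templates:
      securityContext:
        runAsUser: 1000           # matches the kaiwo user created with `useradd -u 1000` in the image
        runAsGroup: 1000
        fsGroup: 1000             # volumes are mounted group-accessible to GID 1000
      containers:
        - name: workload          # illustrative container name
          image: ghcr.io/silogen/rocm-ray:v0.7   # default image bumped in pkg/cli/apply/utils.go
```

With `fsGroup: 1000`, storage requested via `--storage=100Gi,nameofyourstorageclass` should be writable by the non-root `kaiwo` user, so model downloads and checkpoints land on the mounted volume without permission errors.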