
[kubectl-plugin] Support specifying number of head GPUs and worker GPUs for RayJob #2989

Open · wants to merge 1 commit into base: master
8 changes: 7 additions & 1 deletion kubectl-plugin/pkg/cmd/job/job_submit.go
@@ -60,8 +60,10 @@ type SubmitJobOptions struct {
	image string
	headCPU string
	headMemory string
+	headGPU string
	workerCPU string
	workerMemory string
+	workerGPU string
	entryPointCPU float32
	entryPointGPU float32
	entryPointMemory int
@@ -94,7 +96,7 @@ var (
kubectl ray job submit --name rayjob-sample --working-dir /path/to/working-dir/ --runtime-env /runtimeEnv.yaml -- python my_script.py

# Generate Ray job with specifications and submit Ray job with runtime Env file and working directory
-kubectl ray job submit --name rayjob-sample --ray-version %s --image %s --head-cpu 1 --head-memory 5Gi --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py
+kubectl ray job submit --name rayjob-sample --ray-version %s --image %s --head-cpu 1 --head-memory 5Gi --head-gpu 1 --worker-replicas 3 --worker-cpu 1 --worker-gpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py

# Generate Ray job with specifications and print out the generated RayJob YAML
kubectl ray job submit --dry-run --name rayjob-sample --ray-version %s --image %s --head-cpu 1 --head-memory 5Gi --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py
@@ -153,9 +155,11 @@ func NewJobSubmitCommand(streams genericclioptions.IOStreams) *cobra.Command {
	cmd.Flags().StringVar(&options.image, "image", fmt.Sprintf("rayproject/ray:%s", options.rayVersion), "container image to use")
	cmd.Flags().StringVar(&options.headCPU, "head-cpu", "2", "number of CPUs in the Ray head")
	cmd.Flags().StringVar(&options.headMemory, "head-memory", "4Gi", "amount of memory in the Ray head")
+	cmd.Flags().StringVar(&options.headGPU, "head-gpu", "0", "number of GPUs in the Ray head")
	cmd.Flags().Int32Var(&options.workerReplicas, "worker-replicas", 1, "desired worker group replicas")
	cmd.Flags().StringVar(&options.workerCPU, "worker-cpu", "2", "number of CPUs in each worker group replica")
	cmd.Flags().StringVar(&options.workerMemory, "worker-memory", "4Gi", "amount of memory in each worker group replica")
+	cmd.Flags().StringVar(&options.workerGPU, "worker-gpu", "0", "number of GPUs in each worker group replica")
	cmd.Flags().BoolVar(&options.dryRun, "dry-run", false, "print the generated YAML instead of creating the cluster. Only works when filename is not provided")

	options.configFlags.AddFlags(cmd.Flags())
@@ -265,8 +269,10 @@ func (options *SubmitJobOptions) Run(ctx context.Context, factory cmdutil.Factor
		Image: options.image,
		HeadCPU: options.headCPU,
		HeadMemory: options.headMemory,
+		HeadGPU: options.headGPU,
		WorkerCPU: options.workerCPU,
		WorkerMemory: options.workerMemory,
+		WorkerGPU: options.workerGPU,
		WorkerReplicas: options.workerReplicas,
	},
}
46 changes: 20 additions & 26 deletions kubectl-plugin/pkg/util/generation/generation.go
@@ -101,34 +101,28 @@ func (rayClusterSpecObject *RayClusterSpecObject) generateRayClusterSpec() *rayv
corev1.ResourceMemory: resource.MustParse(rayClusterSpecObject.WorkerMemory),
}))))))

-	// If the HeadGPU resource is set with a value, then proceed with parsing.
-	if rayClusterSpecObject.HeadGPU != "" {
-		headGPUResource := resource.MustParse(rayClusterSpecObject.HeadGPU)
-		if !headGPUResource.IsZero() {
-			var requests, limits corev1.ResourceList
-			requests = *rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests
-			limits = *rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits
-			requests[corev1.ResourceName(resourceNvidiaGPU)] = headGPUResource
-			limits[corev1.ResourceName(resourceNvidiaGPU)] = headGPUResource
-
-			rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests = &requests
-			rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits = &limits
-		}
-	}
+	headGPUResource := resource.MustParse(rayClusterSpecObject.HeadGPU)
+	if !headGPUResource.IsZero() {
+		var requests, limits corev1.ResourceList
+		requests = *rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests
+		limits = *rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits
+		requests[corev1.ResourceName(resourceNvidiaGPU)] = headGPUResource
+		limits[corev1.ResourceName(resourceNvidiaGPU)] = headGPUResource
+
+		rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests = &requests
+		rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Limits = &limits
+	}

-	// If the workerGPU resource is set with a value, then proceed with parsing.
-	if rayClusterSpecObject.WorkerGPU != "" {
-		workerGPUResource := resource.MustParse(rayClusterSpecObject.WorkerGPU)
-		if !workerGPUResource.IsZero() {
-			var requests, limits corev1.ResourceList
-			requests = *rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests
-			limits = *rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits
-			requests[corev1.ResourceName(resourceNvidiaGPU)] = workerGPUResource
-			limits[corev1.ResourceName(resourceNvidiaGPU)] = workerGPUResource
-
-			rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests = &requests
-			rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits = &limits
-		}
-	}
Comment on lines -104 to -131 (Contributor Author):
Previously, kubectl ray job submit did not support setting headGPU and workerGPU, so those fields were left empty and parsing them caused a panic. With the new flags added, the original implementation can be restored.
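
For context on the panic, a minimal standalone sketch (not part of this PR) of the MustParse behavior the removed `!= ""` guard was protecting against:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// With the new flags defaulting to "0", MustParse always receives a
	// valid quantity string, and IsZero gates whether a GPU entry is added.
	q := resource.MustParse("0")
	fmt.Println(q.IsZero()) // true: no nvidia.com/gpu entry is added

	// Before this PR, job submit left HeadGPU/WorkerGPU as "", and
	// resource.MustParse("") panics because "" is not a valid quantity —
	// hence the empty-string guard that this change removes.
	// q = resource.MustParse("") // would panic
}
```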

+	workerGPUResource := resource.MustParse(rayClusterSpecObject.WorkerGPU)
+	if !workerGPUResource.IsZero() {
+		var requests, limits corev1.ResourceList
+		requests = *rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests
+		limits = *rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits
+		requests[corev1.ResourceName(resourceNvidiaGPU)] = workerGPUResource
+		limits[corev1.ResourceName(resourceNvidiaGPU)] = workerGPUResource
+
+		rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests = &requests
+		rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits = &limits
+	}

return rayClusterSpec
1 change: 1 addition & 0 deletions kubectl-plugin/pkg/util/generation/generation_test.go
@@ -75,6 +75,7 @@ func TestGenerateRayJobApplyConfig(t *testing.T) {
	assert.Equal(t, testRayJobYamlObject.Image, *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Image)
	assert.Equal(t, resource.MustParse(testRayJobYamlObject.HeadCPU), *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Cpu())
	assert.Equal(t, resource.MustParse(testRayJobYamlObject.HeadMemory), *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Memory())
+	assert.Equal(t, resource.MustParse(testRayJobYamlObject.HeadGPU), *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Name(corev1.ResourceName("nvidia.com/gpu"), resource.DecimalSI))
	assert.Equal(t, "default-group", *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName)
	assert.Equal(t, testRayJobYamlObject.WorkerReplicas, *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)
	assert.Equal(t, resource.MustParse(testRayJobYamlObject.WorkerCPU), *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests.Cpu())
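
As a side note on the new assertion: corev1.ResourceList exposes a Name helper that looks up a quantity by resource name and returns a zero quantity in the supplied format when the key is absent, which is how the test reads the GPU request back out of the generated pod spec. A minimal standalone sketch (not from the PR):

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	requests := corev1.ResourceList{
		corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
	}
	// Name returns the quantity stored under the key, or a zero quantity
	// in the given format (here DecimalSI) if the key is missing.
	gpu := requests.Name(corev1.ResourceName("nvidia.com/gpu"), resource.DecimalSI)
	fmt.Println(gpu.String()) // "1"
}
```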