From eaa75fa6707c8781834d4cbc8c356bc2c2dbce6e Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Wed, 27 Jul 2022 22:28:38 -0700 Subject: [PATCH] [Autoscaler] Match autoscaler image to Ray head image for Ray >= 2.0.0 (#423) * Implement the logic. Signed-off-by: Dmitri Gekhtman * Fix function call. Signed-off-by: Dmitri Gekhtman * Test. Signed-off-by: Dmitri Gekhtman * Update example. Signed-off-by: Dmitri Gekhtman * lowercase Signed-off-by: Dmitri Gekhtman * lint Signed-off-by: Dmitri Gekhtman * wording Signed-off-by: Dmitri Gekhtman --- .../samples/ray-cluster.autoscaler.yaml | 4 +- .../controllers/ray/common/constant.go | 3 + ray-operator/controllers/ray/common/pod.go | 58 ++++++++++++++++--- .../controllers/ray/common/pod_test.go | 35 +++++++++-- 4 files changed, 85 insertions(+), 15 deletions(-) diff --git a/ray-operator/config/samples/ray-cluster.autoscaler.yaml b/ray-operator/config/samples/ray-cluster.autoscaler.yaml index fae7c4b6b5..2c0d58776a 100644 --- a/ray-operator/config/samples/ray-cluster.autoscaler.yaml +++ b/ray-operator/config/samples/ray-cluster.autoscaler.yaml @@ -28,7 +28,9 @@ spec: # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. idleTimeoutSeconds: 60 # image optionally overrides the autoscaler's container image. - image: "rayproject/ray:0860dd" + # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as + # the ray container by default. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. + ## image: "my-repo/my-custom-autoscaler-image:tag" # imagePullPolicy optionally overrides the autoscaler container's image pull policy. imagePullPolicy: Always # resources specifies optional resource request and limit overrides for the autoscaler container. diff --git a/ray-operator/controllers/ray/common/constant.go b/ray-operator/controllers/ray/common/constant.go index a5d457988c..7b7a2de40a 100644 --- a/ray-operator/controllers/ray/common/constant.go +++ b/ray-operator/controllers/ray/common/constant.go @@ -92,6 +92,9 @@ const ( // Ray health check related configurations RayAgentRayletHealthPath = "api/local_raylet_healthz" RayDashboardGCSHealthPath = "api/gcs_healthz" + + // Default autoscaler image when running Ray at versions older than 2.0.0 + FallbackDefaultAutoscalerImage = "rayproject/ray:2.0.0" ) type ServiceType string diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 609b9c623f..180aec0b1e 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -111,8 +111,12 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a // set custom service account with proper roles bound. podTemplate.Spec.ServiceAccountName = utils.GetHeadGroupServiceAccountName(&instance) + rayContainerIndex := getRayContainerIndex(podTemplate.Spec) + rayHeadImage := podTemplate.Spec.Containers[rayContainerIndex].Image + // Determine the default image to use for the Ray container. + autoscalerImage := getAutoscalerImage(rayHeadImage, instance.Spec.RayVersion) // inject autoscaler container into head pod - autoscalerContainer := BuildAutoscalerContainer() + autoscalerContainer := BuildAutoscalerContainer(autoscalerImage) // Merge the user overrides from autoscalerOptions into the autoscaler container config. mergeAutoscalerOverrides(&autoscalerContainer, instance.Spec.AutoscalerOptions) podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, autoscalerContainer) @@ -139,6 +143,44 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a return podTemplate } +// getAutoscalerImage determines the default autoscaler image +func getAutoscalerImage(rayHeadImage string, rayVersion string) string { + if autoscalerSupportIsStable(rayVersion) { + // For Ray versions >= 2.0.0, use the Ray head's image to run the autoscaler. + return rayHeadImage + } else { + // For older Ray versions, use the Ray 2.0.0 image to run the autoscaler. + return FallbackDefaultAutoscalerImage + } +} + +// Determine if autoscaler support is stable in the given rayVersion. +// Return false exactly when the major version is successfully parsed and less than 2. +// Example rayVersion inputs that return true: "2.0.0", "2.0", "2", "2.0.0rc1", "nightly", "latest", "unknown". +// Example inputs that return false: "1.13.0", "1.12", "1". +func autoscalerSupportIsStable(rayVersion string) bool { + // Try to determine major version by extracting everything that comes before the first "." + firstDotIndex := strings.Index(rayVersion, ".") + var majorVersionString string + if firstDotIndex == -1 { + // If there is no ".", try parsing the entire rayVersion as the major version. + majorVersionString = rayVersion + } else { + // Everything up to the first "." + majorVersionString = rayVersion[:firstDotIndex] + } + + if majorVersion, err := strconv.Atoi(majorVersionString); err == nil { + return majorVersion >= 2 + } else { + // If in doubt, just assume that the Ray version is >= 2.0.0, + // so that we use the Ray image to run the autoscaler. + // Currently, there is a lot of "doubt," since the version string is not validated. + // Users can always override the operator's choice of image with autoscalerOptions.image. + return true + } +} + // DefaultWorkerPodTemplate sets the config values func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayiov1alpha1.WorkerGroupSpec, podName string, svcName string, headPort string) v1.PodTemplateSpec { podTemplate := workerSpec.Template @@ -234,7 +276,7 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN ObjectMeta: podTemplateSpec.ObjectMeta, Spec: podTemplateSpec.Spec, } - rayContainerIndex := getRayContainerIndex(pod) + rayContainerIndex := getRayContainerIndex(pod.Spec) // Add /dev/shm volumeMount for the object store to avoid performance degradation. addEmptyDir(&pod.Spec.Containers[rayContainerIndex], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory) @@ -327,12 +369,10 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN } // BuildAutoscalerContainer builds a Ray autoscaler container which can be appended to the head pod. -func BuildAutoscalerContainer() v1.Container { +func BuildAutoscalerContainer(autoscalerImage string) v1.Container { container := v1.Container{ - Name: AutoscalerContainerName, - // TODO: choose right version based on instance.spec.Version - // The currently used image reflects the latest changes from Ray master. - Image: "rayproject/ray:0860dd", + Name: AutoscalerContainerName, + Image: autoscalerImage, ImagePullPolicy: v1.PullAlways, Env: []v1.EnvVar{ { @@ -412,11 +452,11 @@ func convertCmdToString(cmdArr []string) (cmd string) { return cmdAggr.String() } -func getRayContainerIndex(pod v1.Pod) (rayContainerIndex int) { +func getRayContainerIndex(podSpec v1.PodSpec) (rayContainerIndex int) { // a ray pod can have multiple containers. // we identify the ray container based on env var: RAY=true // if the env var is missing, we choose containers[0]. - for i, container := range pod.Spec.Containers { + for i, container := range podSpec.Containers { for _, env := range container.Env { if env.Name == strings.ToLower("ray") && env.Value == strings.ToLower("true") { log.Info("Head pod container with index " + strconv.Itoa(i) + " identified as Ray container based on env RAY=true.") diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index e8e9724156..c4a51332c2 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -27,7 +27,7 @@ var instance = rayiov1alpha1.RayCluster{ Namespace: "default", }, Spec: rayiov1alpha1.RayClusterSpec{ - RayVersion: "12.0.1", + RayVersion: "2.0.0", HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{ ServiceType: "ClusterIP", Replicas: pointer.Int32Ptr(1), @@ -51,7 +51,7 @@ var instance = rayiov1alpha1.RayCluster{ Containers: []v1.Container{ { Name: "ray-head", - Image: "rayproject/ray:12.0.1", + Image: "repo/image:custom", Env: []v1.EnvVar{ { Name: "MY_POD_IP", @@ -101,7 +101,7 @@ var instance = rayiov1alpha1.RayCluster{ Containers: []v1.Container{ { Name: "ray-worker", - Image: "rayproject/autoscaler", + Image: "repo/image:custom", Resources: v1.ResourceRequirements{ Limits: v1.ResourceList{ "nvidia.com/gpu": resource.MustParse("3"), @@ -181,7 +181,7 @@ var volumeMountsWithAutoscaler = []v1.VolumeMount{ var autoscalerContainer = v1.Container{ Name: "autoscaler", - Image: "rayproject/ray:0860dd", + Image: "repo/image:custom", ImagePullPolicy: v1.PullAlways, Env: []v1.EnvVar{ { @@ -231,6 +231,31 @@ var autoscalerContainer = v1.Container{ var trueFlag = true +func TestGetAutoscalerImage(t *testing.T) { + // rayVersion strings for which we judge autoscaler support is stable and thus + // use the same image for the autoscaler as for the Ray container. + newRayVersions := []string{"2.0.0", "2.0.0rc0", "2.0", "2", "latest", "nightly", "what's this"} + rayImage := "repo/image:tag" + for _, rayVersion := range newRayVersions { + expectedAutoscalerImage := rayImage + actualAutoscalerImage := getAutoscalerImage(rayImage, rayVersion) + if actualAutoscalerImage != expectedAutoscalerImage { + t.Fatalf("Expected `%v` but got `%v`", expectedAutoscalerImage, actualAutoscalerImage) + } + } + + // rayVersion strings for which we judge autoscaler support is not stable and thus + // use the default Ray 2.0.0 image to run the autoscaler. + oldRayVersions := []string{"1", "1.13", "1.13.0"} + for _, rayVersion := range oldRayVersions { + expectedAutoscalerImage := "rayproject/ray:2.0.0" + actualAutoscalerImage := getAutoscalerImage(rayImage, rayVersion) + if actualAutoscalerImage != expectedAutoscalerImage { + t.Fatalf("Expected `%v` but got `%v`", expectedAutoscalerImage, actualAutoscalerImage) + } + } +} + func TestGetHeadPort(t *testing.T) { headStartParams := make(map[string]string) actualResult := GetHeadPort(headStartParams) @@ -242,7 +267,7 @@ func TestGetHeadPort(t *testing.T) { headStartParams["port"] = "9999" actualResult = GetHeadPort(headStartParams) expectedResult = "9999" - if !(actualResult == expectedResult) { + if actualResult != expectedResult { t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult) } }