Skip to content

Commit

Permalink
[Autoscaler] Match autoscaler image to Ray head image for Ray >= 2.0.0 (
Browse files Browse the repository at this point in the history
#423)

* Implement the logic.

Signed-off-by: Dmitri Gekhtman <[email protected]>

* Fix function call.

Signed-off-by: Dmitri Gekhtman <[email protected]>

* Test.

Signed-off-by: Dmitri Gekhtman <[email protected]>

* Update example.

Signed-off-by: Dmitri Gekhtman <[email protected]>

* lowercase

Signed-off-by: Dmitri Gekhtman <[email protected]>

* lint

Signed-off-by: Dmitri Gekhtman <[email protected]>

* wording

Signed-off-by: Dmitri Gekhtman <[email protected]>
  • Loading branch information
DmitriGekhtman committed Jul 28, 2022
1 parent f3415f4 commit eaa75fa
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 15 deletions.
4 changes: 3 additions & 1 deletion ray-operator/config/samples/ray-cluster.autoscaler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ spec:
# idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
idleTimeoutSeconds: 60
# image optionally overrides the autoscaler's container image.
image: "rayproject/ray:0860dd"
# If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
# the ray container by default. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
## image: "my-repo/my-custom-autoscaler-image:tag"
# imagePullPolicy optionally overrides the autoscaler container's image pull policy.
imagePullPolicy: Always
# resources specifies optional resource request and limit overrides for the autoscaler container.
Expand Down
3 changes: 3 additions & 0 deletions ray-operator/controllers/ray/common/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ const (
// Ray health check related configurations
RayAgentRayletHealthPath = "api/local_raylet_healthz"
RayDashboardGCSHealthPath = "api/gcs_healthz"

// Default autoscaler image when running Ray at versions older than 2.0.0
FallbackDefaultAutoscalerImage = "rayproject/ray:2.0.0"
)

type ServiceType string
Expand Down
58 changes: 49 additions & 9 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,12 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a
// set custom service account with proper roles bound.
podTemplate.Spec.ServiceAccountName = utils.GetHeadGroupServiceAccountName(&instance)

rayContainerIndex := getRayContainerIndex(podTemplate.Spec)
rayHeadImage := podTemplate.Spec.Containers[rayContainerIndex].Image
// Determine the default image to use for the Ray container.
autoscalerImage := getAutoscalerImage(rayHeadImage, instance.Spec.RayVersion)
// inject autoscaler container into head pod
autoscalerContainer := BuildAutoscalerContainer()
autoscalerContainer := BuildAutoscalerContainer(autoscalerImage)
// Merge the user overrides from autoscalerOptions into the autoscaler container config.
mergeAutoscalerOverrides(&autoscalerContainer, instance.Spec.AutoscalerOptions)
podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, autoscalerContainer)
Expand All @@ -139,6 +143,44 @@ func DefaultHeadPodTemplate(instance rayiov1alpha1.RayCluster, headSpec rayiov1a
return podTemplate
}

// getAutoscalerImage determines the default autoscaler image
func getAutoscalerImage(rayHeadImage string, rayVersion string) string {
if autoscalerSupportIsStable(rayVersion) {
// For Ray versions >= 2.0.0, use the Ray head's image to run the autoscaler.
return rayHeadImage
} else {
// For older Ray versions, use the Ray 2.0.0 image to run the autoscaler.
return FallbackDefaultAutoscalerImage
}
}

// Determine if autoscaler support is stable in the given rayVersion.
// Return false exactly when the major version is successfully parsed and less than 2.
// Example rayVersion inputs that return true: "2.0.0", "2.0", "2", "2.0.0rc1", "nightly", "latest", "unknown".
// Example inputs that return false: "1.13.0", "1.12", "1".
func autoscalerSupportIsStable(rayVersion string) bool {
// Try to determine major version by extracting everything that comes before the first "."
firstDotIndex := strings.Index(rayVersion, ".")
var majorVersionString string
if firstDotIndex == -1 {
// If there is no ".", try parsing the entire rayVersion as the major version.
majorVersionString = rayVersion
} else {
// Everything up to the first "."
majorVersionString = rayVersion[:firstDotIndex]
}

if majorVersion, err := strconv.Atoi(majorVersionString); err == nil {
return majorVersion >= 2
} else {
// If in doubt, just assume that the Ray version is >= 2.0.0,
// so that we use the Ray image to run the autoscaler.
// Currently, there is a lot of "doubt," since the version string is not validated.
// Users can always override the operator's choice of image with autoscalerOptions.image.
return true
}
}

// DefaultWorkerPodTemplate sets the config values
func DefaultWorkerPodTemplate(instance rayiov1alpha1.RayCluster, workerSpec rayiov1alpha1.WorkerGroupSpec, podName string, svcName string, headPort string) v1.PodTemplateSpec {
podTemplate := workerSpec.Template
Expand Down Expand Up @@ -234,7 +276,7 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN
ObjectMeta: podTemplateSpec.ObjectMeta,
Spec: podTemplateSpec.Spec,
}
rayContainerIndex := getRayContainerIndex(pod)
rayContainerIndex := getRayContainerIndex(pod.Spec)

// Add /dev/shm volumeMount for the object store to avoid performance degradation.
addEmptyDir(&pod.Spec.Containers[rayContainerIndex], &pod, SharedMemoryVolumeName, SharedMemoryVolumeMountPath, v1.StorageMediumMemory)
Expand Down Expand Up @@ -327,12 +369,10 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN
}

// BuildAutoscalerContainer builds a Ray autoscaler container which can be appended to the head pod.
func BuildAutoscalerContainer() v1.Container {
func BuildAutoscalerContainer(autoscalerImage string) v1.Container {
container := v1.Container{
Name: AutoscalerContainerName,
// TODO: choose right version based on instance.spec.Version
// The currently used image reflects the latest changes from Ray master.
Image: "rayproject/ray:0860dd",
Name: AutoscalerContainerName,
Image: autoscalerImage,
ImagePullPolicy: v1.PullAlways,
Env: []v1.EnvVar{
{
Expand Down Expand Up @@ -412,11 +452,11 @@ func convertCmdToString(cmdArr []string) (cmd string) {
return cmdAggr.String()
}

func getRayContainerIndex(pod v1.Pod) (rayContainerIndex int) {
func getRayContainerIndex(podSpec v1.PodSpec) (rayContainerIndex int) {
// a ray pod can have multiple containers.
// we identify the ray container based on env var: RAY=true
// if the env var is missing, we choose containers[0].
for i, container := range pod.Spec.Containers {
for i, container := range podSpec.Containers {
for _, env := range container.Env {
if env.Name == strings.ToLower("ray") && env.Value == strings.ToLower("true") {
log.Info("Head pod container with index " + strconv.Itoa(i) + " identified as Ray container based on env RAY=true.")
Expand Down
35 changes: 30 additions & 5 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ var instance = rayiov1alpha1.RayCluster{
Namespace: "default",
},
Spec: rayiov1alpha1.RayClusterSpec{
RayVersion: "12.0.1",
RayVersion: "2.0.0",
HeadGroupSpec: rayiov1alpha1.HeadGroupSpec{
ServiceType: "ClusterIP",
Replicas: pointer.Int32Ptr(1),
Expand All @@ -51,7 +51,7 @@ var instance = rayiov1alpha1.RayCluster{
Containers: []v1.Container{
{
Name: "ray-head",
Image: "rayproject/ray:12.0.1",
Image: "repo/image:custom",
Env: []v1.EnvVar{
{
Name: "MY_POD_IP",
Expand Down Expand Up @@ -101,7 +101,7 @@ var instance = rayiov1alpha1.RayCluster{
Containers: []v1.Container{
{
Name: "ray-worker",
Image: "rayproject/autoscaler",
Image: "repo/image:custom",
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
"nvidia.com/gpu": resource.MustParse("3"),
Expand Down Expand Up @@ -181,7 +181,7 @@ var volumeMountsWithAutoscaler = []v1.VolumeMount{

var autoscalerContainer = v1.Container{
Name: "autoscaler",
Image: "rayproject/ray:0860dd",
Image: "repo/image:custom",
ImagePullPolicy: v1.PullAlways,
Env: []v1.EnvVar{
{
Expand Down Expand Up @@ -231,6 +231,31 @@ var autoscalerContainer = v1.Container{

var trueFlag = true

func TestGetAutoscalerImage(t *testing.T) {
// rayVersion strings for which we judge autoscaler support is stable and thus
// use the same image for the autoscaler as for the Ray container.
newRayVersions := []string{"2.0.0", "2.0.0rc0", "2.0", "2", "latest", "nightly", "what's this"}
rayImage := "repo/image:tag"
for _, rayVersion := range newRayVersions {
expectedAutoscalerImage := rayImage
actualAutoscalerImage := getAutoscalerImage(rayImage, rayVersion)
if actualAutoscalerImage != expectedAutoscalerImage {
t.Fatalf("Expected `%v` but got `%v`", expectedAutoscalerImage, actualAutoscalerImage)
}
}

// rayVersion strings for which we judge autoscaler support is not stable and thus
// use the default Ray 2.0.0 image to run the autoscaler.
oldRayVersions := []string{"1", "1.13", "1.13.0"}
for _, rayVersion := range oldRayVersions {
expectedAutoscalerImage := "rayproject/ray:2.0.0"
actualAutoscalerImage := getAutoscalerImage(rayImage, rayVersion)
if actualAutoscalerImage != expectedAutoscalerImage {
t.Fatalf("Expected `%v` but got `%v`", expectedAutoscalerImage, actualAutoscalerImage)
}
}
}

func TestGetHeadPort(t *testing.T) {
headStartParams := make(map[string]string)
actualResult := GetHeadPort(headStartParams)
Expand All @@ -242,7 +267,7 @@ func TestGetHeadPort(t *testing.T) {
headStartParams["port"] = "9999"
actualResult = GetHeadPort(headStartParams)
expectedResult = "9999"
if !(actualResult == expectedResult) {
if actualResult != expectedResult {
t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult)
}
}
Expand Down

0 comments on commit eaa75fa

Please sign in to comment.