Skip to content

Commit

Permalink
Don’t assign rayv1.Failed to the State field (ray-project#2258)
Browse files Browse the repository at this point in the history
  • Loading branch information
Yicheng-Lu-llll authored Jul 19, 2024
1 parent 8c64e60 commit ee0a895
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 35 deletions.
3 changes: 2 additions & 1 deletion ray-operator/apis/ray/v1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ type UpscalingMode string
type ClusterState string

const (
Ready ClusterState = "ready"
Ready ClusterState = "ready"
// Failed is deprecated, but we keep it to avoid compilation errors in projects that import the KubeRay Golang module.
Failed ClusterState = "failed"
Suspended ClusterState = "suspended"
)
Expand Down
67 changes: 34 additions & 33 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1146,49 +1146,50 @@ func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcu
}

func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *rayv1.RayCluster, reconcileErr error) (*rayv1.RayCluster, error) {
// TODO: Replace this log and use reconcileErr to set the condition field.
logger := ctrl.LoggerFrom(ctx)
if reconcileErr != nil {
logger.Info("Reconciliation error", "error", reconcileErr)
}

// Deep copy the instance, so we don't mutate the original object.
newInstance := instance.DeepCopy()

if reconcileErr != nil {
newInstance.Status.State = rayv1.Failed
newInstance.Status.Reason = reconcileErr.Error()
} else {
// TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not.
newInstance.Status.ObservedGeneration = newInstance.ObjectMeta.Generation
// TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not.
newInstance.Status.ObservedGeneration = newInstance.ObjectMeta.Generation

runtimePods := corev1.PodList{}
filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: newInstance.Name}
if err := r.List(ctx, &runtimePods, client.InNamespace(newInstance.Namespace), filterLabels); err != nil {
return nil, err
}
runtimePods := corev1.PodList{}
filterLabels := client.MatchingLabels{utils.RayClusterLabelKey: newInstance.Name}
if err := r.List(ctx, &runtimePods, client.InNamespace(newInstance.Namespace), filterLabels); err != nil {
return nil, err
}

newInstance.Status.ReadyWorkerReplicas = utils.CalculateReadyReplicas(runtimePods)
newInstance.Status.AvailableWorkerReplicas = utils.CalculateAvailableReplicas(runtimePods)
newInstance.Status.DesiredWorkerReplicas = utils.CalculateDesiredReplicas(ctx, newInstance)
newInstance.Status.MinWorkerReplicas = utils.CalculateMinReplicas(newInstance)
newInstance.Status.MaxWorkerReplicas = utils.CalculateMaxReplicas(newInstance)
newInstance.Status.ReadyWorkerReplicas = utils.CalculateReadyReplicas(runtimePods)
newInstance.Status.AvailableWorkerReplicas = utils.CalculateAvailableReplicas(runtimePods)
newInstance.Status.DesiredWorkerReplicas = utils.CalculateDesiredReplicas(ctx, newInstance)
newInstance.Status.MinWorkerReplicas = utils.CalculateMinReplicas(newInstance)
newInstance.Status.MaxWorkerReplicas = utils.CalculateMaxReplicas(newInstance)

totalResources := utils.CalculateDesiredResources(newInstance)
newInstance.Status.DesiredCPU = totalResources[corev1.ResourceCPU]
newInstance.Status.DesiredMemory = totalResources[corev1.ResourceMemory]
newInstance.Status.DesiredGPU = sumGPUs(totalResources)
newInstance.Status.DesiredTPU = totalResources[corev1.ResourceName("google.com/tpu")]
totalResources := utils.CalculateDesiredResources(newInstance)
newInstance.Status.DesiredCPU = totalResources[corev1.ResourceCPU]
newInstance.Status.DesiredMemory = totalResources[corev1.ResourceMemory]
newInstance.Status.DesiredGPU = sumGPUs(totalResources)
newInstance.Status.DesiredTPU = totalResources[corev1.ResourceName("google.com/tpu")]

if utils.CheckAllPodsRunning(ctx, runtimePods) {
newInstance.Status.State = rayv1.Ready
}
if utils.CheckAllPodsRunning(ctx, runtimePods) {
newInstance.Status.State = rayv1.Ready
}

if newInstance.Spec.Suspend != nil && *newInstance.Spec.Suspend && len(runtimePods.Items) == 0 {
newInstance.Status.State = rayv1.Suspended
}
if newInstance.Spec.Suspend != nil && *newInstance.Spec.Suspend && len(runtimePods.Items) == 0 {
newInstance.Status.State = rayv1.Suspended
}

if err := r.updateEndpoints(ctx, newInstance); err != nil {
return nil, err
}
if err := r.updateEndpoints(ctx, newInstance); err != nil {
return nil, err
}

if err := r.updateHeadInfo(ctx, newInstance); err != nil {
return nil, err
}
if err := r.updateHeadInfo(ctx, newInstance); err != nil {
return nil, err
}

timeNow := metav1.Now()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1585,7 +1585,7 @@ func TestInconsistentRayClusterStatus(t *testing.T) {

// Case 1: `State` is different => return true
newStatus := oldStatus.DeepCopy()
newStatus.State = rayv1.Failed
newStatus.State = rayv1.Suspended
assert.True(t, r.inconsistentRayClusterStatus(ctx, oldStatus, *newStatus))

// Case 2: `Reason` is different => return true
Expand Down

0 comments on commit ee0a895

Please sign in to comment.