Skip to content

Commit

Permalink
operator: add more context to reconcile error
Browse files Browse the repository at this point in the history
Previously, the operator only reported the current `state` of a crashed container in the pod.
That state may not contain all the needed information or a clear crash reason.

This commit checks the `lastState` of the container status and adds the log message with the reason.
It also skips running container states, which should reduce bloat in the log messages.

Related issue:
#1223

Signed-off-by: f41gh7 <[email protected]>
  • Loading branch information
f41gh7 committed Jan 20, 2025
1 parent d16fd0c commit 56a6b8d
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 13 deletions.
4 changes: 2 additions & 2 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ aliases:
* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): decrease latency of generated configuration updates. Previously, configuration was updated after the status of child objects was changed. It could take significant time at large scale. See [this issue](https://github.com/VictoriaMetrics/operator/issues/1220) for details.
* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): reduce load on Kubernetes API server at prometheus-converter client.
* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): change default value for `client.qps=50` and `client.burst=100` in order to improve operator performance on scale. See [this issue](https://github.com/VictoriaMetrics/operator/issues/1220) for details.
* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): add new flag `controller.statusLastUpdateTimeTTL=1h` to control staleness detection at `status.conditions` field. If the operator serves a large number of objects (> 5_000), the value of this flag should be increased.
* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): add new flag `controller.statusLastUpdateTimeTTL=1h` to control staleness detection at `status.conditions` field. If the operator serves a large number of objects (> 5_000), the value of this flag should be increased.
* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): add more context to the `failed` status with the reason and logs of the crashed container. See [this issue](https://github.com/VictoriaMetrics/operator/issues/1223) for details.

* BUGFIX: [vmagent](https://docs.victoriametrics.com/operator/resources/vmagent/): properly build `relabelConfigs` with empty string values for `separator` and `replacement` fields. See [this issue](https://github.com/VictoriaMetrics/operator/issues/1214) for details.
* BUGFIX: [converter](https://docs.victoriametrics.com/operator/migration/#objects-conversion): properly format `regex` single value expression at Prometheus Operator CRD `relabelings` and `metricsRelabelings`. See [this issue](https://github.com/VictoriaMetrics/operator/issues/1219) for details.


## [v0.51.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.51.3)

**Release date:** 8 Jan 2025
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/operator/factory/reconcile/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ func waitDeploymentReady(ctx context.Context, rclient client.Client, dep *appsv1
if isErrDealine {
return err
}
return &errWaitReady{origin: podErr}
return podErr
}
return nil
}
Expand Down
45 changes: 35 additions & 10 deletions internal/controller/operator/factory/reconcile/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,30 +342,55 @@ func waitForPodReady(ctx context.Context, rclient client.Client, ns, podName str
}

func podStatusesToError(origin error, pod *corev1.Pod) error {
var hasCrashedContainers bool
var conditions []string
for _, cond := range pod.Status.Conditions {
conditions = append(conditions, fmt.Sprintf("name=%q,status=%q,message=%q", cond.Type, cond.Status, cond.Message))
conditions = append(conditions, fmt.Sprintf("name=%s,status=%s,message=%s", cond.Type, cond.Status, cond.Message))
}

stateToString := func(state corev1.ContainerState) string {
switch {
case state.Running != nil:
return fmt.Sprintf("running since: %s", state.Running.StartedAt)
case state.Terminated != nil:
return fmt.Sprintf("terminated reason=%q, exit_code=%d", state.Terminated.Message, state.Terminated.ExitCode)
return fmt.Sprintf("terminated message=%s, exit_code=%d, reason=%s", state.Terminated.Message, state.Terminated.ExitCode, state.Terminated.Reason)
case state.Waiting != nil:
return fmt.Sprintf("waiting with reason=%q", state.Waiting.Reason)
return fmt.Sprintf("waiting with reason=%s, message=%s", state.Waiting.Reason, state.Waiting.Message)
}
return "container at waiting state"
return ""
}
for _, condStatus := range pod.Status.ContainerStatuses {
conditions = append(conditions, fmt.Sprintf("name=%q,is_ready=%v,restart_count=%d,state=%s", condStatus.Name, condStatus.Ready, condStatus.RestartCount, stateToString(condStatus.State)))
isCrashed := func(st corev1.ContainerStatus) bool {
if st.RestartCount > 0 && st.LastTerminationState.Terminated != nil && st.State.Waiting != nil {
return true
}
if st.State.Waiting != nil && st.State.Waiting.Reason != "PodInitializing" && st.State.Waiting.Message != "" {
return true
}
return false
}
for _, condStatus := range pod.Status.InitContainerStatuses {
conditions = append(conditions, fmt.Sprintf("name=%q,is_ready=%v,restart_count=%d,state=%s", condStatus.Name, condStatus.Ready, condStatus.RestartCount, stateToString(condStatus.State)))
var containerStates []string
addContainerStatus := func(namePrefix string, css []corev1.ContainerStatus) {
for _, condStatus := range css {
stateMsg := stateToString(condStatus.LastTerminationState)
if stateMsg == "" {
stateMsg = stateToString(condStatus.State)
}
if stateMsg == "" {
continue
}
if isCrashed(condStatus) {
hasCrashedContainers = true
}
containerStates = append(containerStates, fmt.Sprintf("%sname=[%s],is_ready=%v,restart_count=%d,state=%s", namePrefix, condStatus.Name, condStatus.Ready, condStatus.RestartCount, stateMsg))
}
}

return fmt.Errorf("origin_Err=%w,podPhase=%q,conditions=%s", origin, pod.Status.Phase, strings.Join(conditions, ","))
addContainerStatus("", pod.Status.ContainerStatuses)
addContainerStatus("init_container_", pod.Status.InitContainerStatuses)
err := fmt.Errorf("origin_Err=%w,podPhase=%s,pod conditions=%s,pod statuses = %s", origin, pod.Status.Phase, strings.Join(conditions, ","), strings.Join(containerStates, ","))
if hasCrashedContainers {
return err
}
return &errWaitReady{origin: err}
}

func sortStsPodsByID(src []corev1.Pod) error {
Expand Down

0 comments on commit 56a6b8d

Please sign in to comment.