Skip to content

Commit

Permalink
fix: crashlooping pod should be unhealthy
Browse files Browse the repository at this point in the history
  • Loading branch information
adityathebe authored and moshloop committed Sep 5, 2024
1 parent 6994e74 commit 4838c18
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,15 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
switch pod.Status.Phase {
case corev1.PodPending:
for _, ctrStatus := range pod.Status.InitContainerStatuses {
if ctrStatus.LastTerminationState.Terminated != nil && ctrStatus.LastTerminationState.Terminated.Reason == "Error" {
// A pending pod whose container was previously terminated with error should be marked as unhealthy (instead of unknown)
return &HealthStatus{
Health: HealthUnhealthy,
Status: HealthStatusCrashLoopBackoff,
Message: ctrStatus.LastTerminationState.Terminated.Reason,
}, nil
}

if msg := getCommonContainerError(&ctrStatus); msg != nil {
return msg, nil
}
Expand Down
1 change: 1 addition & 0 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-running-not-ready.yaml", health.HealthStatusStarting, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-crashloop.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/pod-crashloop-pending.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/pod-imagepullbackoff.yaml", "ImagePullBackOff", health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/pod-error.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/pod-running-restart-always.yaml", health.HealthStatusRunning, health.HealthHealthy, true)
Expand Down
143 changes: 143 additions & 0 deletions pkg/health/testdata/pod-crashloop-pending.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
apiVersion: v1
kind: Pod
metadata:
uid: 628d9538-5762-4f5a-8c96-2e8c2691b05a
name: crashloop-deployment-6bf9b7b858-m9882
labels:
app: crashloop
pod-template-hash: 6bf9b7b858
namespace: default
generateName: crashloop-deployment-6bf9b7b858-
ownerReferences:
- uid: fa555eeb-bb03-4d84-b02f-756de909ed92
kind: ReplicaSet
name: crashloop-deployment-6bf9b7b858
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-09-05T12:36:30Z
spec:
volumes:
- name: kube-api-access-5rvpz
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
nodeName: esr
priority: 0
dnsPolicy: ClusterFirst
containers:
- name: main-container
image: busybox
command:
- sh
- -c
- echo "Main container running"; sleep 3600
resources: {}
volumeMounts:
- name: kube-api-access-5rvpz
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: Always
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Always
schedulerName: default-scheduler
initContainers:
- name: crashloop-init
image: busybox
command:
- sh
- -c
- echo "Init container starting..."; sleep 20; exit 1
resources: {}
volumeMounts:
- name: kube-api-access-5rvpz
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: Always
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
serviceAccount: default
securityContext: {}
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: true
serviceAccountName: default
terminationGracePeriodSeconds: 30
status:
phase: Pending
podIP: 10.42.1.228
hostIP: 10.99.99.9
podIPs:
- ip: 10.42.1.228
qosClass: BestEffort
startTime: 2024-09-05T12:36:30Z
conditions:
- type: Initialized
reason: ContainersNotInitialized
status: "False"
message: "containers with incomplete status: [crashloop-init]"
- type: Ready
reason: ContainersNotReady
status: "False"
message: "containers with unready status: [main-container]"
- type: ContainersReady
reason: ContainersNotReady
status: "False"
message: "containers with unready status: [main-container]"
- type: PodScheduled
status: "True"
containerStatuses:
- name: main-container
image: busybox
ready: false
state:
waiting:
reason: PodInitializing
imageID: ""
started: false
lastState: {}
restartCount: 0
initContainerStatuses:
- name: crashloop-init
image: docker.io/library/busybox:latest
ready: false
state:
terminated:
reason: Error
exitCode: 1
startedAt: 2024-09-05T12:37:04Z
finishedAt: 2024-09-05T12:37:24Z
containerID: containerd://f27823b96f3b484a634279f169b6361656c426dd3e99ef6b43235f794cc47f45
imageID: docker.io/library/busybox@sha256:34b191d63fbc93e25e275bfccf1b5365664e5ac28f06d974e8d50090fbb49f41
lastState:
terminated:
reason: Error
exitCode: 1
startedAt: 2024-09-05T12:36:41Z
finishedAt: 2024-09-05T12:37:01Z
containerID: containerd://fe709cbb96ab0ea44862001df0a9f6298bb775cd8f8bc5ebdd4c4b264e33b69f
containerID: containerd://f27823b96f3b484a634279f169b6361656c426dd3e99ef6b43235f794cc47f45
restartCount: 1

0 comments on commit 4838c18

Please sign in to comment.