-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
prevent driver pod from being deleted before its status is processed by the operator (#2054) #2076
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,7 @@ import ( | |
crdlisters "github.com/kubeflow/spark-operator/pkg/client/listers/sparkoperator.k8s.io/v1beta2" | ||
"github.com/kubeflow/spark-operator/pkg/config" | ||
"github.com/kubeflow/spark-operator/pkg/util" | ||
"github.com/kubeflow/spark-operator/pkg/webhook" | ||
) | ||
|
||
const ( | ||
|
@@ -613,6 +614,9 @@ func (c *Controller) syncSparkApplication(key string) error { | |
return err | ||
} | ||
case v1beta2.CompletedState, v1beta2.FailedState: | ||
if err := c.removeDriverPodFinalizer(app); err != nil { | ||
return err | ||
} | ||
if c.hasApplicationExpired(app) { | ||
glog.Infof("Garbage collecting expired SparkApplication %s/%s", app.Namespace, app.Name) | ||
err := c.crdClient.SparkoperatorV1beta2().SparkApplications(app.Namespace).Delete(context.TODO(), app.Name, metav1.DeleteOptions{GracePeriodSeconds: int64ptr(0)}) | ||
|
@@ -893,6 +897,10 @@ func (c *Controller) deleteSparkResources(app *v1beta2.SparkApplication) error { | |
driverPodName = getDriverPodName(app) | ||
} | ||
|
||
if err := c.removeDriverPodFinalizer(app); err != nil { | ||
return fmt.Errorf("delete spark resource, %w", err) | ||
} | ||
|
||
glog.V(2).Infof("Deleting pod %s in namespace %s", driverPodName, app.Namespace) | ||
err := c.kubeClient.CoreV1().Pods(app.Namespace).Delete(context.TODO(), driverPodName, metav1.DeleteOptions{}) | ||
if err != nil && !errors.IsNotFound(err) { | ||
|
@@ -1125,6 +1133,36 @@ func (c *Controller) cleanUpOnTermination(oldApp, newApp *v1beta2.SparkApplicati | |
return nil | ||
} | ||
|
||
// removeDriverPodFinalizer strips the webhook.DriverFinalize finalizer from the
// driver pod of app, leaving every other finalizer on the pod in place.
//
// The pod name is read from app.Status.DriverInfo.PodName, falling back to
// getDriverPodName(app) when the status has no name recorded. A driver pod that
// no longer exists is not treated as an error. Failures from the pod Get or
// Update calls are returned wrapped with context.
func (c *Controller) removeDriverPodFinalizer(app *v1beta2.SparkApplication) error { | ||
driverPodName := app.Status.DriverInfo.PodName | ||
if driverPodName == "" { | ||
driverPodName = getDriverPodName(app) | ||
} | ||
pod, err := c.kubeClient.CoreV1().Pods(app.Namespace).Get(context.TODO(), driverPodName, metav1.GetOptions{}) | ||
// A missing pod has no finalizer left to remove, so report success.
if errors.IsNotFound(err) { | ||
return nil | ||
} | ||
if err != nil { | ||
return fmt.Errorf("get driver pod %s failed, %w", driverPodName, err) | ||
} | ||
oldFinalizer := pod.Finalizers | ||
var newFinalizer []string | ||
// Keep every finalizer except webhook.DriverFinalize.
for _, finalizer := range oldFinalizer { | ||
if finalizer != webhook.DriverFinalize { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: Do we need a check to verify that the web hook is enabled? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @peter-mcclonski , glad to your review, if webhook is disabled, driver pod will not contains the finalizer, so it does not need a check |
||
newFinalizer = append(newFinalizer, finalizer) | ||
} | ||
} | ||
// Only issue an Update when filtering actually dropped an entry.
if len(oldFinalizer) != len(newFinalizer) { | ||
pod.Finalizers = newFinalizer | ||
_, err := c.kubeClient.CoreV1().Pods(app.Namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}) | ||
if err != nil { | ||
return fmt.Errorf("remove driver pod finalizer failed, %w", err) | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// int64ptr returns a pointer to a copy of n.
func int64ptr(n int64) *int64 { | ||
return &n | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am wondering whether, when you try to delete a running SparkApplication, the Spark operator has to wait for the app to complete or fail before it can remove the finalizer. If so, a long-running Spark job, such as a Spark streaming job, may never reach the completed or failed state, and the finalizer will never be removed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right: the finalizer is used to control pod deletion. If the streaming driver pod is running, the owning Spark job will never reach the completed or failed state. However, if the driver pod is deleted manually, or its status leaves both the pending and running phases, the streaming SparkApplication will eventually turn into completed or failed, and at that point the driver finalizer will be removed.