Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Ensure Cluster topology controller is not stuck when MDs are stuck in deletion #11771

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 50 additions & 44 deletions internal/controllers/topology/cluster/current_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,58 +204,64 @@ func (r *Reconciler) getCurrentMachineDeploymentState(ctx context.Context, bluep
return nil, fmt.Errorf("duplicate MachineDeployment %s found for label %s: %s", klog.KObj(m), clusterv1.ClusterTopologyMachineDeploymentNameLabel, mdTopologyName)
}

// Gets the bootstrapRef.
bootstrapRef := m.Spec.Template.Spec.Bootstrap.ConfigRef
if bootstrapRef == nil {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a Bootstrap Config", klog.KObj(m))
}
// Gets the infraRef.
infraRef := &m.Spec.Template.Spec.InfrastructureRef
if infraRef.Name == "" {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a InfrastructureMachineTemplate", klog.KObj(m))
}
// Skip getting templates for MachineDeployments that are in deleting.
// Note: We don't need these templates for deleting MDs, but also this would likely fail because usually
// the MachineDeployment topology controller deletes the templates as soon as the MD is in deleting.
var bootstrapTemplate, infraMachineTemplate *unstructured.Unstructured
if m.DeletionTimestamp.IsZero() {
// Gets the bootstrapRef.
bootstrapRef := m.Spec.Template.Spec.Bootstrap.ConfigRef
if bootstrapRef == nil {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a Bootstrap Config", klog.KObj(m))
}
// Gets the infraRef.
infraRef := &m.Spec.Template.Spec.InfrastructureRef
if infraRef.Name == "" {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a InfrastructureMachineTemplate", klog.KObj(m))
}

// If the mdTopology exists in the Cluster, lookup the corresponding mdBluePrint and align
// the apiVersions in the bootstrapRef and infraRef.
// If the mdTopology doesn't exist, do nothing (this can happen if the mdTopology was deleted).
// **Note** We can't check if the MachineDeployment has a DeletionTimestamp, because at this point it could not be set yet.
if mdTopologyExistsInCluster, mdClassName := getMDClassName(cluster, mdTopologyName); mdTopologyExistsInCluster {
mdBluePrint, ok := blueprintMachineDeployments[mdClassName]
if !ok {
return nil, fmt.Errorf("failed to find MachineDeployment class %s in ClusterClass", mdClassName)
// If the mdTopology exists in the Cluster, lookup the corresponding mdBluePrint and align
// the apiVersions in the bootstrapRef and infraRef.
// If the mdTopology doesn't exist, do nothing (this can happen if the mdTopology was deleted).
// **Note** We can't check if the MachineDeployment has a DeletionTimestamp, because at this point it could not be set yet.
if mdTopologyExistsInCluster, mdClassName := getMDClassName(cluster, mdTopologyName); mdTopologyExistsInCluster {
mdBluePrint, ok := blueprintMachineDeployments[mdClassName]
if !ok {
return nil, fmt.Errorf("failed to find MachineDeployment class %s in ClusterClass", mdClassName)
}
bootstrapRef, err = alignRefAPIVersion(mdBluePrint.BootstrapTemplate, bootstrapRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
}
infraRef, err = alignRefAPIVersion(mdBluePrint.InfrastructureMachineTemplate, infraRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
}
}
bootstrapRef, err = alignRefAPIVersion(mdBluePrint.BootstrapTemplate, bootstrapRef)

// Get the BootstrapTemplate.
bootstrapTemplate, err = r.getReference(ctx, bootstrapRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
}
infraRef, err = alignRefAPIVersion(mdBluePrint.InfrastructureMachineTemplate, infraRef)
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(bootstrapTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", bootstrapTemplate.GetKind(), klog.KObj(bootstrapTemplate), klog.KObj(m))
}

// Get the InfraMachineTemplate.
infraMachineTemplate, err = r.getReference(ctx, infraRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
}
}

// Get the BootstrapTemplate.
bootstrapTemplate, err := r.getReference(ctx, bootstrapRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
}
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(bootstrapTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", bootstrapTemplate.GetKind(), klog.KObj(bootstrapTemplate), klog.KObj(m))
}

// Get the InfraMachineTemplate.
infraMachineTemplate, err := r.getReference(ctx, infraRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
}
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(infraMachineTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", infraMachineTemplate.GetKind(), klog.KObj(infraMachineTemplate), klog.KObj(m))
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(infraMachineTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", infraMachineTemplate.GetKind(), klog.KObj(infraMachineTemplate), klog.KObj(m))
}
}

// Gets the MachineHealthCheck.
Expand Down
74 changes: 74 additions & 0 deletions internal/controllers/topology/cluster/current_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ import (

"github.com/google/go-cmp/cmp"
. "github.com/onsi/gomega"
"golang.org/x/exp/maps"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
. "sigs.k8s.io/controller-runtime/pkg/envtest/komega"
Expand Down Expand Up @@ -115,6 +117,9 @@ func TestGetCurrentState(t *testing.T) {
WithBootstrapTemplate(machineDeploymentBootstrap).
WithInfrastructureTemplate(machineDeploymentInfrastructure).
Build()
machineDeploymentWithDeletionTimestamp := machineDeployment.DeepCopy()
machineDeploymentWithDeletionTimestamp.Finalizers = []string{clusterv1.MachineDeploymentFinalizer} // required by fake client
machineDeploymentWithDeletionTimestamp.DeletionTimestamp = ptr.To(metav1.Now())
machineDeployment2 := builder.MachineDeployment(metav1.NamespaceDefault, "md2").
WithLabels(map[string]string{
clusterv1.ClusterNameLabel: "cluster1",
Expand Down Expand Up @@ -1056,6 +1061,70 @@ func TestGetCurrentState(t *testing.T) {
MachinePools: emptyMachinePools,
},
},
{
name: "Pass reading a full Cluster with a deleting MachineDeployment",
cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").
WithInfrastructureCluster(infraCluster).
WithControlPlane(controlPlaneWithInfra).
WithTopology(builder.ClusterTopology().
WithMachineDeployment(clusterv1.MachineDeploymentTopology{
Class: "mdClass",
Name: "md1",
}).
Build()).
Build(),
blueprint: &scope.ClusterBlueprint{
ClusterClass: clusterClassWithControlPlaneInfra,
InfrastructureClusterTemplate: infraClusterTemplate,
ControlPlane: &scope.ControlPlaneBlueprint{
Template: controlPlaneTemplateWithInfrastructureMachine,
InfrastructureMachineTemplate: controlPlaneInfrastructureMachineTemplate,
},
MachineDeployments: map[string]*scope.MachineDeploymentBlueprint{
"mdClass": {
BootstrapTemplate: machineDeploymentBootstrap,
InfrastructureMachineTemplate: machineDeploymentInfrastructure,
},
},
},
objects: []client.Object{
infraCluster,
clusterClassWithControlPlaneInfra,
controlPlaneInfrastructureMachineTemplate,
controlPlaneWithInfra,
machineDeploymentWithDeletionTimestamp,
machineHealthCheckForMachineDeployment,
machineHealthCheckForControlPlane,
},
// Expect valid return of full ClusterState with MachineDeployment without corresponding templates.
want: &scope.ClusterState{
Cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").
WithInfrastructureCluster(infraCluster).
WithControlPlane(controlPlaneWithInfra).
WithTopology(builder.ClusterTopology().
WithMachineDeployment(clusterv1.MachineDeploymentTopology{
Class: "mdClass",
Name: "md1",
}).
Build()).
Build(),
ControlPlane: &scope.ControlPlaneState{
Object: controlPlaneWithInfra,
InfrastructureMachineTemplate: controlPlaneInfrastructureMachineTemplate,
MachineHealthCheck: machineHealthCheckForControlPlane,
},
InfrastructureCluster: infraCluster,
MachineDeployments: map[string]*scope.MachineDeploymentState{
"md1": {
Object: machineDeploymentWithDeletionTimestamp,
BootstrapTemplate: nil,
InfrastructureMachineTemplate: nil,
MachineHealthCheck: machineHealthCheckForMachineDeployment,
},
},
MachinePools: emptyMachinePools,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down Expand Up @@ -1096,6 +1165,11 @@ func TestGetCurrentState(t *testing.T) {
return
}

// Don't compare the deletionTimestamps as there are some minor differences in how they are stored pre/post fake client.
for _, md := range append(maps.Values(got.MachineDeployments), maps.Values(tt.want.MachineDeployments)...) {
md.Object.DeletionTimestamp = nil
}

// Use EqualObject where the compared object is passed through the fakeClient. Elsewhere the Equal method is
// good enough to establish equality.
g.Expect(got.Cluster).To(EqualObject(tt.want.Cluster, IgnoreAutogeneratedMetadata))
Expand Down
14 changes: 10 additions & 4 deletions internal/controllers/topology/cluster/reconcile_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,10 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope
}
}

if !currentMD.Object.DeletionTimestamp.IsZero() {
return nil
}

// Return early if the MachineDeployment is pending an upgrade.
// Do not reconcile the MachineDeployment yet to avoid updating the MachineDeployment while it is still pending a
// version upgrade. This will prevent the MachineDeployment from performing a double rollout.
Expand Down Expand Up @@ -809,11 +813,13 @@ func (r *Reconciler) deleteMachineDeployment(ctx context.Context, cluster *clust
return err
}
}
log.Info("Deleting MachineDeployment")
if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "failed to delete MachineDeployment %s", klog.KObj(md.Object))
if md.Object.DeletionTimestamp.IsZero() {
log.Info("Deleting MachineDeployment")
if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "failed to delete MachineDeployment %s", klog.KObj(md.Object))
}
r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted MachineDeployment %q", klog.KObj(md.Object))
}
r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted MachineDeployment %q", klog.KObj(md.Object))
return nil
}

Expand Down