Skip to content

Commit

Permalink
Ensure Cluster topology controller is not stuck when MDs are stuck in…
Browse files Browse the repository at this point in the history
… deletion

Signed-off-by: Stefan Büringer [email protected]
  • Loading branch information
sbueringer committed Jan 29, 2025
1 parent 6d98f39 commit d07153a
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 48 deletions.
94 changes: 50 additions & 44 deletions internal/controllers/topology/cluster/current_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,58 +204,64 @@ func (r *Reconciler) getCurrentMachineDeploymentState(ctx context.Context, bluep
return nil, fmt.Errorf("duplicate MachineDeployment %s found for label %s: %s", klog.KObj(m), clusterv1.ClusterTopologyMachineDeploymentNameLabel, mdTopologyName)
}

// Gets the bootstrapRef.
bootstrapRef := m.Spec.Template.Spec.Bootstrap.ConfigRef
if bootstrapRef == nil {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a Bootstrap Config", klog.KObj(m))
}
// Gets the infraRef.
infraRef := &m.Spec.Template.Spec.InfrastructureRef
if infraRef.Name == "" {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a InfrastructureMachineTemplate", klog.KObj(m))
}
// Skip getting templates for MachineDeployments that are in deleting.
// Note: We don't need these templates for deleting MDs, but also this would likely fail because usually
// the MachineDeployment topology controller deletes the templates as soon as the MD is in deleting.
var bootstrapTemplate, infraMachineTemplate *unstructured.Unstructured
if m.DeletionTimestamp.IsZero() {
// Gets the bootstrapRef.
bootstrapRef := m.Spec.Template.Spec.Bootstrap.ConfigRef
if bootstrapRef == nil {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a Bootstrap Config", klog.KObj(m))
}
// Gets the infraRef.
infraRef := &m.Spec.Template.Spec.InfrastructureRef
if infraRef.Name == "" {
return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a InfrastructureMachineTemplate", klog.KObj(m))
}

// If the mdTopology exists in the Cluster, lookup the corresponding mdBluePrint and align
// the apiVersions in the bootstrapRef and infraRef.
// If the mdTopology doesn't exist, do nothing (this can happen if the mdTopology was deleted).
// **Note** We can't check if the MachineDeployment has a DeletionTimestamp, because at this point it could not be set yet.
if mdTopologyExistsInCluster, mdClassName := getMDClassName(cluster, mdTopologyName); mdTopologyExistsInCluster {
mdBluePrint, ok := blueprintMachineDeployments[mdClassName]
if !ok {
return nil, fmt.Errorf("failed to find MachineDeployment class %s in ClusterClass", mdClassName)
// If the mdTopology exists in the Cluster, lookup the corresponding mdBluePrint and align
// the apiVersions in the bootstrapRef and infraRef.
// If the mdTopology doesn't exist, do nothing (this can happen if the mdTopology was deleted).
// **Note** We can't check if the MachineDeployment has a DeletionTimestamp, because at this point it could not be set yet.
if mdTopologyExistsInCluster, mdClassName := getMDClassName(cluster, mdTopologyName); mdTopologyExistsInCluster {
mdBluePrint, ok := blueprintMachineDeployments[mdClassName]
if !ok {
return nil, fmt.Errorf("failed to find MachineDeployment class %s in ClusterClass", mdClassName)
}
bootstrapRef, err = alignRefAPIVersion(mdBluePrint.BootstrapTemplate, bootstrapRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
}
infraRef, err = alignRefAPIVersion(mdBluePrint.InfrastructureMachineTemplate, infraRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
}
}
bootstrapRef, err = alignRefAPIVersion(mdBluePrint.BootstrapTemplate, bootstrapRef)

// Get the BootstrapTemplate.
bootstrapTemplate, err = r.getReference(ctx, bootstrapRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
}
infraRef, err = alignRefAPIVersion(mdBluePrint.InfrastructureMachineTemplate, infraRef)
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(bootstrapTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", bootstrapTemplate.GetKind(), klog.KObj(bootstrapTemplate), klog.KObj(m))
}

// Get the InfraMachineTemplate.
infraMachineTemplate, err = r.getReference(ctx, infraRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
}
}

// Get the BootstrapTemplate.
bootstrapTemplate, err := r.getReference(ctx, bootstrapRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
}
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(bootstrapTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", bootstrapTemplate.GetKind(), klog.KObj(bootstrapTemplate), klog.KObj(m))
}

// Get the InfraMachineTemplate.
infraMachineTemplate, err := r.getReference(ctx, infraRef)
if err != nil {
return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
}
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(infraMachineTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", infraMachineTemplate.GetKind(), klog.KObj(infraMachineTemplate), klog.KObj(m))
// check that the referenced object has the ClusterTopologyOwnedLabel label.
// Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
// owned by the topology.
if !labels.IsTopologyOwned(infraMachineTemplate) {
return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", infraMachineTemplate.GetKind(), klog.KObj(infraMachineTemplate), klog.KObj(m))
}
}

// Gets the MachineHealthCheck.
Expand Down
74 changes: 74 additions & 0 deletions internal/controllers/topology/cluster/current_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ import (

"github.com/google/go-cmp/cmp"
. "github.com/onsi/gomega"
"golang.org/x/exp/maps"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
. "sigs.k8s.io/controller-runtime/pkg/envtest/komega"
Expand Down Expand Up @@ -115,6 +117,9 @@ func TestGetCurrentState(t *testing.T) {
WithBootstrapTemplate(machineDeploymentBootstrap).
WithInfrastructureTemplate(machineDeploymentInfrastructure).
Build()
machineDeploymentWithDeletionTimestamp := machineDeployment.DeepCopy()
machineDeploymentWithDeletionTimestamp.Finalizers = []string{clusterv1.MachineDeploymentFinalizer} // required by fake client
machineDeploymentWithDeletionTimestamp.DeletionTimestamp = ptr.To(metav1.Now())
machineDeployment2 := builder.MachineDeployment(metav1.NamespaceDefault, "md2").
WithLabels(map[string]string{
clusterv1.ClusterNameLabel: "cluster1",
Expand Down Expand Up @@ -1056,6 +1061,70 @@ func TestGetCurrentState(t *testing.T) {
MachinePools: emptyMachinePools,
},
},
{
name: "Pass reading a full Cluster with a deleting MachineDeployment",
cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").
WithInfrastructureCluster(infraCluster).
WithControlPlane(controlPlaneWithInfra).
WithTopology(builder.ClusterTopology().
WithMachineDeployment(clusterv1.MachineDeploymentTopology{
Class: "mdClass",
Name: "md1",
}).
Build()).
Build(),
blueprint: &scope.ClusterBlueprint{
ClusterClass: clusterClassWithControlPlaneInfra,
InfrastructureClusterTemplate: infraClusterTemplate,
ControlPlane: &scope.ControlPlaneBlueprint{
Template: controlPlaneTemplateWithInfrastructureMachine,
InfrastructureMachineTemplate: controlPlaneInfrastructureMachineTemplate,
},
MachineDeployments: map[string]*scope.MachineDeploymentBlueprint{
"mdClass": {
BootstrapTemplate: machineDeploymentBootstrap,
InfrastructureMachineTemplate: machineDeploymentInfrastructure,
},
},
},
objects: []client.Object{
infraCluster,
clusterClassWithControlPlaneInfra,
controlPlaneInfrastructureMachineTemplate,
controlPlaneWithInfra,
machineDeploymentWithDeletionTimestamp,
machineHealthCheckForMachineDeployment,
machineHealthCheckForControlPlane,
},
// Expect valid return of full ClusterState with MachineDeployment without corresponding templates.
want: &scope.ClusterState{
Cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").
WithInfrastructureCluster(infraCluster).
WithControlPlane(controlPlaneWithInfra).
WithTopology(builder.ClusterTopology().
WithMachineDeployment(clusterv1.MachineDeploymentTopology{
Class: "mdClass",
Name: "md1",
}).
Build()).
Build(),
ControlPlane: &scope.ControlPlaneState{
Object: controlPlaneWithInfra,
InfrastructureMachineTemplate: controlPlaneInfrastructureMachineTemplate,
MachineHealthCheck: machineHealthCheckForControlPlane,
},
InfrastructureCluster: infraCluster,
MachineDeployments: map[string]*scope.MachineDeploymentState{
"md1": {
Object: machineDeploymentWithDeletionTimestamp,
BootstrapTemplate: nil,
InfrastructureMachineTemplate: nil,
MachineHealthCheck: machineHealthCheckForMachineDeployment,
},
},
MachinePools: emptyMachinePools,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down Expand Up @@ -1096,6 +1165,11 @@ func TestGetCurrentState(t *testing.T) {
return
}

// Don't compare the deletionTimestamps as there are some minor differences in how they are stored pre/post fake client.
for _, md := range append(maps.Values(got.MachineDeployments), maps.Values(tt.want.MachineDeployments)...) {
md.Object.DeletionTimestamp = nil
}

// Use EqualObject where the compared object is passed through the fakeClient. Elsewhere the Equal method is
// good enough to establish equality.
g.Expect(got.Cluster).To(EqualObject(tt.want.Cluster, IgnoreAutogeneratedMetadata))
Expand Down
14 changes: 10 additions & 4 deletions internal/controllers/topology/cluster/reconcile_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,10 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope
}
}

if !currentMD.Object.DeletionTimestamp.IsZero() {
return nil
}

// Return early if the MachineDeployment is pending an upgrade.
// Do not reconcile the MachineDeployment yet to avoid updating the MachineDeployment while it is still pending a
// version upgrade. This will prevent the MachineDeployment from performing a double rollout.
Expand Down Expand Up @@ -809,11 +813,13 @@ func (r *Reconciler) deleteMachineDeployment(ctx context.Context, cluster *clust
return err
}
}
log.Info("Deleting MachineDeployment")
if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "failed to delete MachineDeployment %s", klog.KObj(md.Object))
if md.Object.DeletionTimestamp.IsZero() {
log.Info("Deleting MachineDeployment")
if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "failed to delete MachineDeployment %s", klog.KObj(md.Object))
}
r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted MachineDeployment %q", klog.KObj(md.Object))
}
r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted MachineDeployment %q", klog.KObj(md.Object))
return nil
}

Expand Down

0 comments on commit d07153a

Please sign in to comment.