Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: invalidate SSM cache upon AMI deprecation #7301

Merged
merged 3 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.16.5
name: ec2nodeclasses.karpenter.k8s.aws
spec:
group: karpenter.k8s.aws
Expand Down
1 change: 1 addition & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ func main() {
op.GetClient(),
op.EventRecorder,
op.UnavailableOfferingsCache,
op.SSMCache,
jonathan-innis marked this conversation as resolved.
Show resolved Hide resolved
cloudProvider,
op.SubnetProvider,
op.SecurityGroupProvider,
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.16.5
name: ec2nodeclasses.karpenter.k8s.aws
spec:
group: karpenter.k8s.aws
Expand Down
36 changes: 30 additions & 6 deletions pkg/apis/v1/ec2nodeclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,16 +484,40 @@ func (in *EC2NodeClass) AMIFamily() string {
if in.Spec.AMIFamily != nil {
return *in.Spec.AMIFamily
}
if term, ok := lo.Find(in.Spec.AMISelectorTerms, func(t AMISelectorTerm) bool {
return t.Alias != ""
}); ok {
return AMIFamilyFromAlias(term.Alias)
if alias := in.Alias(); alias != nil {
return alias.Family
}
// Unreachable: validation enforces that one of the above conditions must be met
return AMIFamilyCustom
}

func AMIFamilyFromAlias(alias string) string {
// Alias is the parsed form of an AMI selector term alias, split into a family and a version.
// Its String method renders the two parts back as "<family>@<version>".
type Alias struct {
jonathan-innis marked this conversation as resolved.
Show resolved Hide resolved
// Family is the AMI family component derived from the alias.
Family string
// Version is the version component derived from the alias.
Version string
}

const (
// AliasVersionLatest is the literal version string "latest".
// NOTE(review): presumably the Alias.Version value for aliases that track the newest release - confirm against callers.
AliasVersionLatest = "latest"
)

// String renders the alias as "<family>@<version>".
func (a *Alias) String() string {
	family, version := a.Family, a.Version
	return fmt.Sprintf("%s@%s", family, version)
}

// Alias returns the parsed alias of the first AMI selector term that sets a non-empty Alias,
// or nil when no term does.
func (in *EC2NodeClass) Alias() *Alias {
	hasAlias := func(t AMISelectorTerm) bool { return t.Alias != "" }
	if match, found := lo.Find(in.Spec.AMISelectorTerms, hasAlias); found {
		return &Alias{
			Family:  amiFamilyFromAlias(match.Alias),
			Version: amiVersionFromAlias(match.Alias),
		}
	}
	return nil
}

func amiFamilyFromAlias(alias string) string {
components := strings.Split(alias, "@")
if len(components) != 2 {
log.Fatalf("failed to parse AMI alias %q, invalid format", alias)
Expand All @@ -513,7 +537,7 @@ func AMIFamilyFromAlias(alias string) string {
return family
}

func AMIVersionFromAlias(alias string) string {
func amiVersionFromAlias(alias string) string {
components := strings.Split(alias, "@")
if len(components) != 2 {
log.Fatalf("failed to parse AMI alias %q, invalid format", alias)
Expand Down
15 changes: 15 additions & 0 deletions pkg/apis/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pkg/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ const (
AssociatePublicIPAddressTTL = 5 * time.Minute
// SSMCacheTTL is the time to drop cached SSM parameter data. This only queries EKS Optimized AMI
// releases, so we should expect this to be updated relatively infrequently.
SSMGetParametersByPathTTL = 24 * time.Hour
SSMCacheTTL = 24 * time.Hour
// DiscoveredCapacityCacheTTL is the time to drop discovered resource capacity data per-instance type
// if it is not updated by a node creation event or refreshed during controller reconciliation
DiscoveredCapacityCacheTTL = 60 * 24 * time.Hour
Expand Down
29 changes: 23 additions & 6 deletions pkg/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

"github.com/awslabs/operatorpkg/controller"
"github.com/awslabs/operatorpkg/status"
"github.com/patrickmn/go-cache"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/karpenter/pkg/cloudprovider"

Expand All @@ -29,6 +30,7 @@ import (
controllersinstancetype "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/instancetype"
controllersinstancetypecapacity "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/instancetype/capacity"
controllerspricing "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/pricing"
ssminvalidation "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/ssm/invalidation"
"github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate"

servicesqs "github.com/aws/aws-sdk-go-v2/service/sqs"
Expand All @@ -39,7 +41,7 @@ import (

"sigs.k8s.io/karpenter/pkg/events"

"github.com/aws/karpenter-provider-aws/pkg/cache"
awscache "github.com/aws/karpenter-provider-aws/pkg/cache"
"github.com/aws/karpenter-provider-aws/pkg/controllers/interruption"
nodeclaimgarbagecollection "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclaim/garbagecollection"
nodeclaimtagging "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclaim/tagging"
Expand All @@ -56,11 +58,25 @@ import (
config "github.com/aws/aws-sdk-go-v2/config"
)

func NewControllers(ctx context.Context, mgr manager.Manager, sess *session.Session, clk clock.Clock, kubeClient client.Client, recorder events.Recorder,
unavailableOfferings *cache.UnavailableOfferings, cloudProvider cloudprovider.CloudProvider, subnetProvider subnet.Provider,
securityGroupProvider securitygroup.Provider, instanceProfileProvider instanceprofile.Provider, instanceProvider instance.Provider,
pricingProvider pricing.Provider, amiProvider amifamily.Provider, launchTemplateProvider launchtemplate.Provider, instanceTypeProvider *instancetype.DefaultProvider) []controller.Controller {

func NewControllers(
ctx context.Context,
mgr manager.Manager,
sess *session.Session,
clk clock.Clock,
kubeClient client.Client,
recorder events.Recorder,
unavailableOfferings *awscache.UnavailableOfferings,
ssmCache *cache.Cache,
cloudProvider cloudprovider.CloudProvider,
subnetProvider subnet.Provider,
securityGroupProvider securitygroup.Provider,
instanceProfileProvider instanceprofile.Provider,
instanceProvider instance.Provider,
pricingProvider pricing.Provider,
amiProvider amifamily.Provider,
launchTemplateProvider launchtemplate.Provider,
instanceTypeProvider *instancetype.DefaultProvider,
) []controller.Controller {
controllers := []controller.Controller{
nodeclasshash.NewController(kubeClient),
nodeclassstatus.NewController(kubeClient, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, launchTemplateProvider),
Expand All @@ -70,6 +86,7 @@ func NewControllers(ctx context.Context, mgr manager.Manager, sess *session.Sess
controllerspricing.NewController(pricingProvider),
controllersinstancetype.NewController(instanceTypeProvider),
controllersinstancetypecapacity.NewController(kubeClient, instanceTypeProvider),
ssminvalidation.NewController(ssmCache, amiProvider),
status.NewController[*v1.EC2NodeClass](kubeClient, mgr.GetEventRecorderFor("karpenter")),
}
if options.FromContext(ctx).InterruptionQueue != "" {
Expand Down
95 changes: 95 additions & 0 deletions pkg/controllers/providers/ssm/invalidation/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package invalidation

import (
"context"
"time"

"github.com/awslabs/operatorpkg/singleton"
"github.com/patrickmn/go-cache"
"github.com/samber/lo"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/karpenter/pkg/operator/injection"

v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1"
"github.com/aws/karpenter-provider-aws/pkg/providers/amifamily"
"github.com/aws/karpenter-provider-aws/pkg/providers/ssm"
)

// The SSM Invalidation controller is responsible for invalidating "latest" SSM parameters when they point to deprecated
// AMIs. This can occur when an EKS-optimized AMI with a regression is released, and the AMI team chooses to deprecate
jmdeal marked this conversation as resolved.
Show resolved Hide resolved
// the AMI. Normally, SSM parameter cache entries expire after 24 hours to prevent a thundering herd upon a new AMI
// release, however Karpenter should react faster when an AMI is deprecated. This controller will ensure Karpenter
// reacts to AMI deprecations within its polling period (30m).
type Controller struct {
// cache is the SSM parameter cache whose entries Reconcile inspects and deletes.
cache *cache.Cache
// amiProvider is used by Reconcile to resolve cached AMI IDs and read their deprecation status.
amiProvider amifamily.Provider
}

// NewController builds an SSM invalidation controller that operates on the given SSM cache
// and resolves AMIs through the given AMI provider.
func NewController(ssmCache *cache.Cache, amiProvider amifamily.Provider) *Controller {
	ctrl := new(Controller)
	ctrl.cache = ssmCache
	ctrl.amiProvider = amiProvider
	return ctrl
}

// Name returns the identifier this controller is registered and logged under.
func (c *Controller) Name() string {
	const controllerName = "providers.ssm.invalidation"
	return controllerName
}

func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, c.Name())

amiIDsToParameters := map[string]ssm.Parameter{}
for _, item := range c.cache.Items() {
jonathan-innis marked this conversation as resolved.
Show resolved Hide resolved
entry := item.Object.(ssm.CacheEntry)
if !entry.Parameter.IsMutable {
continue
}
amiIDsToParameters[entry.Value] = entry.Parameter
}
amis := []amifamily.AMI{}
for _, nodeClass := range lo.Map(lo.Keys(amiIDsToParameters), func(amiID string, _ int) *v1.EC2NodeClass {
return &v1.EC2NodeClass{
Spec: v1.EC2NodeClassSpec{
AMISelectorTerms: []v1.AMISelectorTerm{{ID: amiID}},
},
}
}) {
resolvedAMIs, err := c.amiProvider.List(ctx, nodeClass)
jonathan-innis marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return reconcile.Result{}, err
}
amis = append(amis, resolvedAMIs...)
}
for _, ami := range amis {
if !ami.Deprecated {
continue
}
parameter := amiIDsToParameters[ami.AmiID]
c.cache.Delete(parameter.CacheKey())
}
return reconcile.Result{RequeueAfter: 30 * time.Minute}, nil
}

// Register wires the controller into the manager as a singleton reconciler under c.Name().
func (c *Controller) Register(_ context.Context, m manager.Manager) error {
	b := controllerruntime.NewControllerManagedBy(m).Named(c.Name())
	b = b.WatchesRawSource(singleton.Source())
	return b.Complete(singleton.AsReconciler(c))
}
Loading
Loading