From 2aed0077ddea1a6354b78e3ce41f5760ce8bbb94 Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Tue, 29 Aug 2023 16:59:33 -0700 Subject: [PATCH] fix: emit events when node template fails to resolve (#4512) --- cmd/controller/main.go | 1 + go.mod | 2 +- go.sum | 4 +- hack/docs/instancetypes_gen_docs.go | 3 +- pkg/cloudprovider/cloudprovider.go | 15 ++++++- pkg/cloudprovider/events/events.go | 40 +++++++++++++++++++ pkg/cloudprovider/suite_test.go | 9 +++-- .../machine/garbagecollection/suite_test.go | 5 ++- pkg/controllers/machine/link/suite_test.go | 5 ++- pkg/providers/instance/suite_test.go | 5 ++- pkg/providers/instancetype/instancetype.go | 5 ++- pkg/providers/instancetype/suite_test.go | 39 +++++++++++++++++- pkg/providers/launchtemplate/suite_test.go | 3 +- 13 files changed, 122 insertions(+), 14 deletions(-) create mode 100644 pkg/cloudprovider/events/events.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 9a9b2d5ff477..0146572b9f2c 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -34,6 +34,7 @@ func main() { awsCloudProvider := cloudprovider.New( op.InstanceTypesProvider, op.InstanceProvider, + op.EventRecorder, op.GetClient(), op.AMIProvider, op.SecurityGroupProvider, diff --git a/go.mod b/go.mod index f03137d01499..c0a956b5e275 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.44.328 - github.com/aws/karpenter-core v0.30.0-rc.0.0.20230824190041-a4c05cede32f + github.com/aws/karpenter-core v0.30.0-rc.0.0.20230829175231-8ad12e5df1cb github.com/imdario/mergo v0.3.16 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.11.0 diff --git a/go.sum b/go.sum index 78902db267d0..96a1b3f51cca 100644 --- a/go.sum +++ b/go.sum @@ -53,8 +53,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.44.328 h1:WBwlf8ym9SDQ/GTIBO9eXyvwappKJyOetWJKl4mT7ZU= github.com/aws/aws-sdk-go v1.44.328/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.30.0-rc.0.0.20230824190041-a4c05cede32f h1:AT2yPv2RzY2rx/GP4aBd+G52FRmx9yXYNz5isH45ThA= -github.com/aws/karpenter-core v0.30.0-rc.0.0.20230824190041-a4c05cede32f/go.mod h1:AQl8m8OtgO2N8IlZlzAU6MTrJTJSbe6K4GwdRUNSJVc= +github.com/aws/karpenter-core v0.30.0-rc.0.0.20230829175231-8ad12e5df1cb h1:JN5JeajfAO/v9ONyXscRTwEIrSQPGMa/Atblpo/iX4A= +github.com/aws/karpenter-core v0.30.0-rc.0.0.20230829175231-8ad12e5df1cb/go.mod h1:AQl8m8OtgO2N8IlZlzAU6MTrJTJSbe6K4GwdRUNSJVc= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A= github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/hack/docs/instancetypes_gen_docs.go b/hack/docs/instancetypes_gen_docs.go index abce6fff4109..878e1f2c0593 100644 --- a/hack/docs/instancetypes_gen_docs.go +++ b/hack/docs/instancetypes_gen_docs.go @@ -87,7 +87,8 @@ func main() { Manager: &FakeManager{}, KubernetesInterface: kubernetes.NewForConfigOrDie(&rest.Config{}), }) - cp := awscloudprovider.New(op.InstanceTypesProvider, op.InstanceProvider, op.GetClient(), op.AMIProvider, op.SecurityGroupProvider, op.SubnetProvider) + cp := awscloudprovider.New(op.InstanceTypesProvider, op.InstanceProvider, + op.EventRecorder, op.GetClient(), op.AMIProvider, op.SecurityGroupProvider, op.SubnetProvider) provider := v1alpha1.AWS{SubnetSelector: map[string]string{ "*": "*", diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index 18dc9b75171a..d076202d2833 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -25,6 +25,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/utils/functional" "github.com/aws/karpenter/pkg/apis" "github.com/aws/karpenter/pkg/apis/v1alpha1" @@ -40,6 +41,7 @@ import ( "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" + cloudproviderevents "github.com/aws/karpenter/pkg/cloudprovider/events" "github.com/aws/karpenter/pkg/providers/amifamily" "github.com/aws/karpenter/pkg/providers/instance" "github.com/aws/karpenter/pkg/providers/instancetype" @@ -65,9 +67,10 @@ type CloudProvider struct { amiProvider *amifamily.Provider securityGroupProvider *securitygroup.Provider subnetProvider *subnet.Provider + recorder events.Recorder } -func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance.Provider, +func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance.Provider, recorder events.Recorder, kubeClient client.Client, amiProvider *amifamily.Provider, securityGroupProvider *securitygroup.Provider, subnetProvider *subnet.Provider) *CloudProvider { return &CloudProvider{ instanceTypeProvider: instanceTypeProvider, @@ -76,6 +79,7 @@ func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance amiProvider: amiProvider, securityGroupProvider: securityGroupProvider, subnetProvider: subnetProvider, + recorder: recorder, } } @@ -85,6 +89,9 @@ func (c *CloudProvider) Create(ctx context.Context, machine *v1alpha5.Machine) ( Annotations[v1alpha5.ProviderCompatabilityAnnotationKey]), machine. Spec.MachineTemplateRef) if err != nil { + if errors.IsNotFound(err) { + c.recorder.Publish(cloudproviderevents.MachineFailedToResolveNodeTemplate(machine)) + } return nil, fmt.Errorf("resolving node template, %w", err) } instanceTypes, err := c.resolveInstanceTypes(ctx, machine, nodeTemplate) @@ -167,6 +174,9 @@ func (c *CloudProvider) GetInstanceTypes(ctx context.Context, provisioner *v1alp } nodeTemplate, err := c.resolveNodeTemplate(ctx, rawProvider, provisioner.Spec.ProviderRef) if err != nil { + if errors.IsNotFound(err) { + c.recorder.Publish(cloudproviderevents.ProvisionerFailedToResolveNodeTemplate(provisioner)) + } return nil, client.IgnoreNotFound(err) } // TODO, break this coupling @@ -200,6 +210,9 @@ func (c *CloudProvider) IsMachineDrifted(ctx context.Context, machine *v1alpha5. } nodeTemplate, err := c.resolveNodeTemplate(ctx, nil, provisioner.Spec.ProviderRef) if err != nil { + if errors.IsNotFound(err) { + c.recorder.Publish(cloudproviderevents.ProvisionerFailedToResolveNodeTemplate(provisioner)) + } return "", client.IgnoreNotFound(fmt.Errorf("resolving node template, %w", err)) } driftReason, err := c.isNodeTemplateDrifted(ctx, machine, provisioner, nodeTemplate) diff --git a/pkg/cloudprovider/events/events.go b/pkg/cloudprovider/events/events.go new file mode 100644 index 000000000000..6b39e3a59b7f --- /dev/null +++ b/pkg/cloudprovider/events/events.go @@ -0,0 +1,40 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package events + +import ( + v1 "k8s.io/api/core/v1" + + "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + "github.com/aws/karpenter-core/pkg/events" +) + +func ProvisionerFailedToResolveNodeTemplate(provisioner *v1alpha5.Provisioner) events.Event { + return events.Event{ + InvolvedObject: provisioner, + Type: v1.EventTypeWarning, + Message: "Failed resolving AWSNodeTemplate", + DedupeValues: []string{string(provisioner.UID)}, + } +} + +func MachineFailedToResolveNodeTemplate(machine *v1alpha5.Machine) events.Event { + return events.Event{ + InvolvedObject: machine, + Type: v1.EventTypeWarning, + Message: "Failed resolving AWSNodeTemplate", + DedupeValues: []string{string(machine.UID)}, + } +} diff --git a/pkg/cloudprovider/suite_test.go b/pkg/cloudprovider/suite_test.go index d5e0f60a3a62..96b1c7ec0e6a 100644 --- a/pkg/cloudprovider/suite_test.go +++ b/pkg/cloudprovider/suite_test.go @@ -71,6 +71,7 @@ var fakeClock *clock.FakeClock var provisioner *v1alpha5.Provisioner var nodeTemplate *v1alpha1.AWSNodeTemplate var machine *v1alpha5.Machine +var recorder events.Recorder func TestAWS(t *testing.T) { ctx = TestContextWithLogger(t) @@ -85,9 +86,11 @@ var _ = BeforeSuite(func() { ctx, stop = context.WithCancel(ctx) awsEnv = test.NewEnvironment(ctx, env) fakeClock = clock.NewFakeClock(time.Now()) - cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) + recorder = events.NewRecorder(&record.FakeRecorder{}) + cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, recorder, + env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) - prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster) + prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), recorder, cloudProvider, cluster) }) var _ = AfterSuite(func() { @@ -452,7 +455,7 @@ var _ = Describe("CloudProvider", func() { Entry("DetailedMonitoring Drift", v1alpha1.AWSNodeTemplateSpec{DetailedMonitoring: aws.Bool(true)}), Entry("AMIFamily Drift", v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{AMIFamily: aws.String(v1alpha1.AMIFamilyBottlerocket)}}), ) - DescribeTable("should not return drifted if dynamic felids are updated", + DescribeTable("should not return drifted if dynamic fields are updated", func(awsnodetemplatespec v1alpha1.AWSNodeTemplateSpec) { ExpectApplied(ctx, env.Client, provisioner, nodeTemplate) isDrifted, err := cloudProvider.IsMachineDrifted(ctx, machine) diff --git a/pkg/controllers/machine/garbagecollection/suite_test.go b/pkg/controllers/machine/garbagecollection/suite_test.go index 1153e117e34d..ab731887aed2 100644 --- a/pkg/controllers/machine/garbagecollection/suite_test.go +++ b/pkg/controllers/machine/garbagecollection/suite_test.go @@ -28,12 +28,14 @@ import ( "github.com/patrickmn/go-cache" "github.com/samber/lo" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" . "knative.dev/pkg/logging/testing" "sigs.k8s.io/controller-runtime/pkg/client" coresettings "github.com/aws/karpenter-core/pkg/apis/settings" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" corecloudprovider "github.com/aws/karpenter-core/pkg/cloudprovider" + "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/operator/controller" "github.com/aws/karpenter-core/pkg/operator/scheme" coretest "github.com/aws/karpenter-core/pkg/test" @@ -67,7 +69,8 @@ var _ = BeforeSuite(func() { ctx = settings.ToContext(ctx, test.Settings()) env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...)) awsEnv = test.NewEnvironment(ctx, env) - cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) + cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), + env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) linkedMachineCache = cache.New(time.Minute*10, time.Second*10) linkController := &link.Controller{ Cache: linkedMachineCache, diff --git a/pkg/controllers/machine/link/suite_test.go b/pkg/controllers/machine/link/suite_test.go index be752b33db12..9f3de9c81072 100644 --- a/pkg/controllers/machine/link/suite_test.go +++ b/pkg/controllers/machine/link/suite_test.go @@ -29,11 +29,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/tools/record" . "knative.dev/pkg/logging/testing" "sigs.k8s.io/controller-runtime/pkg/client" coresettings "github.com/aws/karpenter-core/pkg/apis/settings" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/operator/controller" "github.com/aws/karpenter-core/pkg/operator/scheme" coretest "github.com/aws/karpenter-core/pkg/test" @@ -66,7 +68,8 @@ var _ = BeforeSuite(func() { ctx = settings.ToContext(ctx, test.Settings()) env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...)) awsEnv = test.NewEnvironment(ctx, env) - cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) + cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), + env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) linkController = link.NewController(env.Client, cloudProvider) }) var _ = AfterSuite(func() { diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index 9c82c8025d3f..9c150151b96b 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -24,11 +24,13 @@ import ( "github.com/samber/lo" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" . "knative.dev/pkg/logging/testing" coresettings "github.com/aws/karpenter-core/pkg/apis/settings" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" corecloudprovider "github.com/aws/karpenter-core/pkg/cloudprovider" + "github.com/aws/karpenter-core/pkg/events" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" "github.com/aws/karpenter-core/pkg/operator/scheme" @@ -62,7 +64,8 @@ var _ = BeforeSuite(func() { ctx = coresettings.ToContext(ctx, coretest.Settings()) ctx = settings.ToContext(ctx, test.Settings()) awsEnv = test.NewEnvironment(ctx, env) - cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) + cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), + env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) }) var _ = AfterSuite(func() { diff --git a/pkg/providers/instancetype/instancetype.go b/pkg/providers/instancetype/instancetype.go index 843638f9ba44..6289d2ccbed9 100644 --- a/pkg/providers/instancetype/instancetype.go +++ b/pkg/providers/instancetype/instancetype.go @@ -102,8 +102,11 @@ func (p *Provider) List(ctx context.Context, kc *v1alpha5.KubeletConfiguration, if item, ok := p.cache.Get(key); ok { return item.([]*cloudprovider.InstanceType), nil } - result := lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType { + // Reject any instance types that don't have any offerings due to zone + result := lo.Reject(lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType { return NewInstanceType(ctx, i, kc, p.region, nodeTemplate, p.createOfferings(ctx, i, instanceTypeZones[aws.StringValue(i.InstanceType)])) + }), func(i *cloudprovider.InstanceType, _ int) bool { + return len(i.Offerings) == 0 }) for _, instanceType := range instanceTypes { InstanceTypeVCPU.With(prometheus.Labels{ diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 5a26b9a18659..6649522d5704 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -91,7 +91,8 @@ var _ = BeforeSuite(func() { awsEnv = test.NewEnvironment(ctx, env) fakeClock = &clock.FakeClock{} - cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) + cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), + env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster) }) @@ -1135,6 +1136,42 @@ var _ = Describe("Instance Types", func() { } }) It("shouldn't report more resources than are actually available on instances", func() { + awsEnv.EC2API.DescribeSubnetsOutput.Set(&ec2.DescribeSubnetsOutput{ + Subnets: []*ec2.Subnet{ + { + AvailabilityZone: aws.String("us-west-2a"), + SubnetId: aws.String("subnet-12345"), + }, + }, + }) + awsEnv.EC2API.DescribeInstanceTypeOfferingsOutput.Set(&ec2.DescribeInstanceTypeOfferingsOutput{ + InstanceTypeOfferings: []*ec2.InstanceTypeOffering{ + { + InstanceType: aws.String("t4g.small"), + Location: aws.String("us-west-2a"), + }, + { + InstanceType: aws.String("t4g.medium"), + Location: aws.String("us-west-2a"), + }, + { + InstanceType: aws.String("t4g.xlarge"), + Location: aws.String("us-west-2a"), + }, + { + InstanceType: aws.String("m5.large"), + Location: aws.String("us-west-2a"), + }, + }, + }) + awsEnv.EC2API.DescribeInstanceTypesOutput.Set(&ec2.DescribeInstanceTypesOutput{ + InstanceTypes: []*ec2.InstanceTypeInfo{ + {InstanceType: aws.String("t4g.small")}, + {InstanceType: aws.String("t4g.medium")}, + {InstanceType: aws.String("t4g.xlarge")}, + {InstanceType: aws.String("m5.large")}, + }, + }) ExpectApplied(ctx, env.Client, provisioner, nodeTemplate) its, err := cloudProvider.GetInstanceTypes(ctx, provisioner) diff --git a/pkg/providers/launchtemplate/suite_test.go b/pkg/providers/launchtemplate/suite_test.go index aca4b6baa692..f88a844c0bdd 100644 --- a/pkg/providers/launchtemplate/suite_test.go +++ b/pkg/providers/launchtemplate/suite_test.go @@ -88,7 +88,8 @@ var _ = BeforeSuite(func() { awsEnv = test.NewEnvironment(ctx, env) fakeClock = &clock.FakeClock{} - cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) + cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), + env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider) cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster) })