From 1bf1f80f18d20c65f07ffb2cf20a21ce606acf13 Mon Sep 17 00:00:00 2001 From: Faixan Date: Sat, 2 Mar 2024 14:12:40 -0800 Subject: [PATCH] fix: Update spot pricing even in Isolated VPC (#5704) Co-authored-by: Jonathan Innis --- pkg/controllers/controllers.go | 7 +-- pkg/operator/options/options.go | 2 +- pkg/providers/pricing/controller.go | 3 +- pkg/providers/pricing/pricing.go | 9 ++++ pkg/providers/pricing/suite_test.go | 40 ++++++++++++++ website/content/en/docs/reference/settings.md | 52 +++++++++---------- .../content/en/preview/reference/settings.md | 2 +- 7 files changed, 80 insertions(+), 35 deletions(-) diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 01fd3b1f3ef0..0cbe45c17053 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -25,7 +25,6 @@ import ( servicesqs "github.com/aws/aws-sdk-go/service/sqs" "github.com/samber/lo" "k8s.io/utils/clock" - "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/karpenter/pkg/events" @@ -55,14 +54,10 @@ func NewControllers(ctx context.Context, sess *session.Session, clk clock.Clock, nodeclass.NewController(kubeClient, recorder, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, launchTemplateProvider), nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider), nodeclaimtagging.NewController(kubeClient, instanceProvider), + pricing.NewController(pricingProvider), } if options.FromContext(ctx).InterruptionQueue != "" { controllers = append(controllers, interruption.NewController(kubeClient, clk, recorder, lo.Must(sqs.NewProvider(ctx, servicesqs.New(sess), options.FromContext(ctx).InterruptionQueue)), unavailableOfferings)) } - if options.FromContext(ctx).IsolatedVPC { - logging.FromContext(ctx).Infof("assuming isolated VPC, pricing information will not be updated") - } else { - controllers = append(controllers, pricing.NewController(pricingProvider)) - } return controllers } diff --git a/pkg/operator/options/options.go b/pkg/operator/options/options.go index 0aad85f970d6..84d9c38c5268 100644 --- a/pkg/operator/options/options.go +++ b/pkg/operator/options/options.go @@ -50,7 +50,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) { fs.StringVar(&o.ClusterCABundle, "cluster-ca-bundle", env.WithDefaultString("CLUSTER_CA_BUNDLE", ""), "Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration.") fs.StringVar(&o.ClusterName, "cluster-name", env.WithDefaultString("CLUSTER_NAME", ""), "[REQUIRED] The kubernetes cluster name for resource discovery.") fs.StringVar(&o.ClusterEndpoint, "cluster-endpoint", env.WithDefaultString("CLUSTER_ENDPOINT", ""), "The external kubernetes cluster endpoint for new nodes to connect with. If not specified, will discover the cluster endpoint using DescribeCluster API.") - fs.BoolVarWithEnv(&o.IsolatedVPC, "isolated-vpc", "ISOLATED_VPC", false, "If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint.") + fs.BoolVarWithEnv(&o.IsolatedVPC, "isolated-vpc", "ISOLATED_VPC", false, "If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.") fs.Float64Var(&o.VMMemoryOverheadPercent, "vm-memory-overhead-percent", env.WithDefaultFloat64("VM_MEMORY_OVERHEAD_PERCENT", 0.075), "The VM memory overhead as a percent that will be subtracted from the total memory for all instance types.") fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.") fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.") diff --git a/pkg/providers/pricing/controller.go b/pkg/providers/pricing/controller.go index 94ed0138181b..f7de883d5bd6 100644 --- a/pkg/providers/pricing/controller.go +++ b/pkg/providers/pricing/controller.go @@ -50,9 +50,10 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) corecontrolle func (c *Controller) updatePricing(ctx context.Context) error { work := []func(ctx context.Context) error{ - c.pricingProvider.UpdateOnDemandPricing, c.pricingProvider.UpdateSpotPricing, + c.pricingProvider.UpdateOnDemandPricing, } + errs := make([]error, len(work)) lop.ForEach(work, func(f func(ctx context.Context) error, i int) { if err := f(ctx); err != nil { diff --git a/pkg/providers/pricing/pricing.go b/pkg/providers/pricing/pricing.go index e46cff66f9bd..ee97df2d9594 100644 --- a/pkg/providers/pricing/pricing.go +++ b/pkg/providers/pricing/pricing.go @@ -25,6 +25,8 @@ import ( "sync" "time" + "github.com/aws/karpenter-provider-aws/pkg/operator/options" + "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" @@ -148,6 +150,13 @@ func (p *Provider) UpdateOnDemandPricing(ctx context.Context) error { var onDemandPrices, onDemandMetalPrices map[string]float64 var onDemandErr, onDemandMetalErr error + // if we are in isolated vpc, skip updating on demand pricing + // as pricing api may not be available + if options.FromContext(ctx).IsolatedVPC { + logging.FromContext(ctx).Infof("assuming isolated VPC, on-demand pricing information will not be updated") + return nil + } + wg.Add(1) go func() { defer wg.Done() diff --git a/pkg/providers/pricing/suite_test.go b/pkg/providers/pricing/suite_test.go index 7dd24e29283d..0c5a21d12a28 100644 --- a/pkg/providers/pricing/suite_test.go +++ b/pkg/providers/pricing/suite_test.go @@ -262,6 +262,46 @@ var _ = Describe("Pricing", func() { Expect(lo.Map(inp.ProductDescriptions, func(x *string, _ int) string { return *x })). To(ContainElements("Linux/UNIX", "Linux/UNIX (Amazon VPC)")) }) + It("should return static on-demand data when in isolated-vpc", func() { + ctx = options.ToContext(ctx, test.Options(test.OptionsFields{ + IsolatedVPC: lo.ToPtr(true), + })) + now := time.Now() + awsEnv.EC2API.DescribeSpotPriceHistoryOutput.Set(&ec2.DescribeSpotPriceHistoryOutput{ + SpotPriceHistory: []*ec2.SpotPrice{ + { + AvailabilityZone: aws.String("test-zone-1b"), + InstanceType: aws.String("c99.large"), + SpotPrice: aws.String("1.50"), + Timestamp: &now, + }, + { + AvailabilityZone: aws.String("test-zone-1b"), + InstanceType: aws.String("c98.large"), + SpotPrice: aws.String("1.10"), + Timestamp: &now, + }, + }, + }) + + awsEnv.PricingAPI.GetProductsOutput.Set(&awspricing.GetProductsOutput{ + // these are incorrect prices which are here to ensure that + // results from only static pricing are used + PriceList: []aws.JSONValue{ + fake.NewOnDemandPrice("c3.2xlarge", 1.20), + fake.NewOnDemandPrice("c5.xlarge", 1.23), + }, + }) + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + price, ok := awsEnv.PricingProvider.OnDemandPrice("c3.2xlarge") + Expect(ok).To(BeTrue()) + Expect(price).To(BeNumerically("==", 0.420000)) + + price, ok = awsEnv.PricingProvider.SpotPrice("c98.large", "test-zone-1b") + Expect(ok).To(BeTrue()) + Expect(price).To(BeNumerically("==", 1.10)) + Expect(getPricingEstimateMetricValue("c98.large", ec2.UsageClassTypeSpot, "test-zone-1b")).To(BeNumerically("==", 1.10)) + }) }) func getPricingEstimateMetricValue(instanceType string, capacityType string, zone string) float64 { diff --git a/website/content/en/docs/reference/settings.md b/website/content/en/docs/reference/settings.md index 4150586483ea..ed828351a181 100644 --- a/website/content/en/docs/reference/settings.md +++ b/website/content/en/docs/reference/settings.md @@ -10,32 +10,32 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf [comment]: <> (the content below is generated from hack/docs/configuration_gen_docs.go) -| Environment Variable | CLI Flag | Description | -|--|--|--| -| ASSUME_ROLE_ARN | \-\-assume-role-arn | Role to assume for calling AWS services.| -| ASSUME_ROLE_DURATION | \-\-assume-role-duration | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRole set. (default = 15m0s)| -| BATCH_IDLE_DURATION | \-\-batch-idle-duration | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. (default = 1s)| -| BATCH_MAX_DURATION | \-\-batch-max-duration | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. (default = 10s)| -| CLUSTER_CA_BUNDLE | \-\-cluster-ca-bundle | Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration.| -| CLUSTER_ENDPOINT | \-\-cluster-endpoint | The external kubernetes cluster endpoint for new nodes to connect with. If not specified, will discover the cluster endpoint using DescribeCluster API.| -| CLUSTER_NAME | \-\-cluster-name | [REQUIRED] The kubernetes cluster name for resource discovery.| -| DISABLE_WEBHOOK | \-\-disable-webhook | Disable the admission and validation webhooks| -| ENABLE_PROFILING | \-\-enable-profiling | Enable the profiling on the metric endpoint| -| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift,SpotToSpotConsolidation (default = Drift=true,SpotToSpotConsolidation=false)| -| HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)| -| INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.| -| ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint.| -| KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate| -| KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)| -| KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)| -| LEADER_ELECT | \-\-leader-elect | Start leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.| -| LOG_LEVEL | \-\-log-level | Log verbosity level. Can be one of 'debug', 'info', or 'error' (default = info)| -| MEMORY_LIMIT | \-\-memory-limit | Memory limit on the container running the controller. The GC soft memory limit is set to 90% of this value. (default = -1)| -| METRICS_PORT | \-\-metrics-port | The port the metric endpoint binds to for operating metrics about the controller itself (default = 8000)| -| RESERVED_ENIS | \-\-reserved-enis | Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html. (default = 0)| -| VM_MEMORY_OVERHEAD_PERCENT | \-\-vm-memory-overhead-percent | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. (default = 0.075)| -| WEBHOOK_METRICS_PORT | \-\-webhook-metrics-port | The port the webhook metric endpoing binds to for operating metrics about the webhook (default = 8001)| -| WEBHOOK_PORT | \-\-webhook-port | The port the webhook endpoint binds to for validation and mutation of resources (default = 8443)| +| Environment Variable | CLI Flag | Description | +|--|--|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ASSUME_ROLE_ARN | \-\-assume-role-arn | Role to assume for calling AWS services. | +| ASSUME_ROLE_DURATION | \-\-assume-role-duration | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRole set. (default = 15m0s) | +| BATCH_IDLE_DURATION | \-\-batch-idle-duration | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. (default = 1s) | +| BATCH_MAX_DURATION | \-\-batch-max-duration | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. (default = 10s) | +| CLUSTER_CA_BUNDLE | \-\-cluster-ca-bundle | Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration. | +| CLUSTER_ENDPOINT | \-\-cluster-endpoint | The external kubernetes cluster endpoint for new nodes to connect with. If not specified, will discover the cluster endpoint using DescribeCluster API. | +| CLUSTER_NAME | \-\-cluster-name | [REQUIRED] The kubernetes cluster name for resource discovery. | +| DISABLE_WEBHOOK | \-\-disable-webhook | Disable the admission and validation webhooks | +| ENABLE_PROFILING | \-\-enable-profiling | Enable the profiling on the metric endpoint | +| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift,SpotToSpotConsolidation (default = Drift=true,SpotToSpotConsolidation=false) | +| HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081) | +| INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. | +| ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint. | +| KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate | +| KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300) | +| KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200) | +| LEADER_ELECT | \-\-leader-elect | Start leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability. | +| LOG_LEVEL | \-\-log-level | Log verbosity level. Can be one of 'debug', 'info', or 'error' (default = info) | +| MEMORY_LIMIT | \-\-memory-limit | Memory limit on the container running the controller. The GC soft memory limit is set to 90% of this value. (default = -1) | +| METRICS_PORT | \-\-metrics-port | The port the metric endpoint binds to for operating metrics about the controller itself (default = 8000) | +| RESERVED_ENIS | \-\-reserved-enis | Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html. (default = 0) | +| VM_MEMORY_OVERHEAD_PERCENT | \-\-vm-memory-overhead-percent | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. (default = 0.075) | +| WEBHOOK_METRICS_PORT | \-\-webhook-metrics-port | The port the webhook metric endpoing binds to for operating metrics about the webhook (default = 8001) | +| WEBHOOK_PORT | \-\-webhook-port | The port the webhook endpoint binds to for validation and mutation of resources (default = 8443) | [comment]: <> (end docs generated content from hack/docs/configuration_gen_docs.go) diff --git a/website/content/en/preview/reference/settings.md b/website/content/en/preview/reference/settings.md index 4150586483ea..753e9ef7b707 100644 --- a/website/content/en/preview/reference/settings.md +++ b/website/content/en/preview/reference/settings.md @@ -24,7 +24,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift,SpotToSpotConsolidation (default = Drift=true,SpotToSpotConsolidation=false)| | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)| | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.| -| ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint.| +| ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.| | KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate| | KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)| | KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)|