Fix failure launching windows pod when instance type isn't in vpc res…

…ource controller config
aws · Jun 27, 2024 · 281c614 · 281c614
1 parent d37e5a2
commit 281c614
Show file tree

Hide file tree

Showing 10 changed files with 194 additions and 69 deletions.
diff --git a/hack/codegen.sh b/hack/codegen.sh
@@ -46,7 +46,7 @@ instanceTypeTestData() {
   GENERATED_FILE="pkg/fake/zz_generated.describe_instance_types.go"
 
   go run hack/code/instancetype_testdata_gen/main.go --out-file ${GENERATED_FILE} \
-    --instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium
+    --instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium,g4ad.16xlarge
 
   checkForUpdates "${GENERATED_FILE}"
 }

diff --git a/pkg/fake/ec2api.go b/pkg/fake/ec2api.go
@@ -614,6 +614,14 @@ func (e *EC2API) DescribeInstanceTypeOfferingsWithContext(_ context.Context, _ *
 				InstanceType: aws.String("g4dn.8xlarge"),
 				Location:     aws.String("test-zone-1b"),
 			},
+			{
+				InstanceType: aws.String("g4ad.16xlarge"),
+				Location:     aws.String("test-zone-1a"),
+			},
+			{
+				InstanceType: aws.String("g4ad.16xlarge"),
+				Location:     aws.String("test-zone-1b"),
+			},
 			{
 				InstanceType: aws.String("t3.large"),
 				Location:     aws.String("test-zone-1a"),

diff --git a/pkg/fake/zz_generated.describe_instance_types.go b/pkg/fake/zz_generated.describe_instance_types.go
@@ -145,6 +145,65 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{
 				},
 			},
 		},
+		{
+			InstanceType:                  aws.String("g4ad.16xlarge"),
+			SupportedUsageClasses:         aws.StringSlice([]string{"on-demand", "spot"}),
+			SupportedVirtualizationTypes:  aws.StringSlice([]string{"hvm"}),
+			BurstablePerformanceSupported: aws.Bool(false),
+			BareMetal:                     aws.Bool(false),
+			Hypervisor:                    aws.String("nitro"),
+			ProcessorInfo: &ec2.ProcessorInfo{
+				Manufacturer:           aws.String("AMD"),
+				SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
+			},
+			VCpuInfo: &ec2.VCpuInfo{
+				DefaultCores: aws.Int64(32),
+				DefaultVCpus: aws.Int64(64),
+			},
+			MemoryInfo: &ec2.MemoryInfo{
+				SizeInMiB: aws.Int64(262144),
+			},
+			EbsInfo: &ec2.EbsInfo{
+				EbsOptimizedInfo: &ec2.EbsOptimizedInfo{
+					BaselineBandwidthInMbps:  aws.Int64(6300),
+					BaselineIops:             aws.Int64(26667),
+					BaselineThroughputInMBps: aws.Float64(787.50),
+					MaximumBandwidthInMbps:   aws.Int64(6300),
+					MaximumIops:              aws.Int64(26667),
+					MaximumThroughputInMBps:  aws.Float64(787.50),
+				},
+				EbsOptimizedSupport: aws.String("default"),
+				EncryptionSupport:   aws.String("supported"),
+				NvmeSupport:         aws.String("required"),
+			},
+			GpuInfo: &ec2.GpuInfo{
+				Gpus: []*ec2.GpuDeviceInfo{
+					{
+						Name:         aws.String("Radeon Pro V520"),
+						Manufacturer: aws.String("AMD"),
+						Count:        aws.Int64(4),
+						MemoryInfo: &ec2.GpuDeviceMemoryInfo{
+							SizeInMiB: aws.Int64(8192),
+						},
+					},
+				},
+			},
+			InstanceStorageInfo: &ec2.InstanceStorageInfo{NvmeSupport: aws.String("required"),
+				TotalSizeInGB: aws.Int64(2400),
+			},
+			NetworkInfo: &ec2.NetworkInfo{
+				MaximumNetworkInterfaces:     aws.Int64(8),
+				Ipv4AddressesPerInterface:    aws.Int64(30),
+				EncryptionInTransitSupported: aws.Bool(true),
+				DefaultNetworkCardIndex:      aws.Int64(0),
+				NetworkCards: []*ec2.NetworkCardInfo{
+					{
+						NetworkCardIndex:         aws.Int64(0),
+						MaximumNetworkInterfaces: aws.Int64(8),
+					},
+				},
+			},
+		},
 		{
 			InstanceType:                  aws.String("g4dn.8xlarge"),
 			SupportedUsageClasses:         aws.StringSlice([]string{"on-demand", "spot"}),

diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go
@@ -598,7 +598,7 @@ var _ = Describe("InstanceTypeProvider", func() {
 		ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
 		ExpectScheduled(ctx, env.Client, pod)
 	})
-	It("should launch AWS Pod ENI on a compatible instance type", func() {
+	It("should launch vpc.amazonaws.com/pod-eni on a compatible instance type", func() {
 		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
 		pod := coretest.UnschedulablePod(coretest.PodOptions{
 			ResourceRequirements: v1.ResourceRequirements{
@@ -615,7 +615,80 @@ var _ = Describe("InstanceTypeProvider", func() {
 		}
 		Expect(supportsPodENI()).To(Equal(true))
 	})
-	It("should launch instances for Nvidia GPU resource requests", func() {
+	It("should launch vpc.amazonaws.com/PrivateIPv4Address on a compatible instance type", func() {
+		nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+		pod := coretest.UnschedulablePod(coretest.PodOptions{
+			ResourceRequirements: v1.ResourceRequirements{
+				Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
+				Limits:   v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
+			},
+		})
+		ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
+		node := ExpectScheduled(ctx, env.Client, pod)
+		Expect(node.Labels).To(HaveKey(v1.LabelInstanceTypeStable))
+		limits, ok := instancetype.Limits[node.Labels[v1.LabelInstanceTypeStable]]
+		Expect(ok).To(BeTrue())
+		Expect(limits.IPv4PerInterface).ToNot(BeZero())
+	})
+	It("should not launch instance type for vpc.amazonaws.com/PrivateIPv4Address if VPC resource controller doesn't advertise it", func() {
+		// Create a "test" instance type that has PrivateIPv4Addresses but isn't advertised in the VPC limits config
+		awsEnv.EC2API.DescribeInstanceTypesOutput.Set(&ec2.DescribeInstanceTypesOutput{
+			InstanceTypes: []*ec2.InstanceTypeInfo{
+				{
+					InstanceType: aws.String("test"),
+					ProcessorInfo: &ec2.ProcessorInfo{
+						SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
+					},
+					VCpuInfo: &ec2.VCpuInfo{
+						DefaultCores: aws.Int64(1),
+						DefaultVCpus: aws.Int64(2),
+					},
+					MemoryInfo: &ec2.MemoryInfo{
+						SizeInMiB: aws.Int64(8192),
+					},
+					NetworkInfo: &ec2.NetworkInfo{
+						Ipv4AddressesPerInterface: aws.Int64(10),
+						DefaultNetworkCardIndex:   aws.Int64(0),
+						NetworkCards: []*ec2.NetworkCardInfo{{
+							NetworkCardIndex:         lo.ToPtr(int64(0)),
+							MaximumNetworkInterfaces: aws.Int64(3),
+						}},
+					},
+					SupportedUsageClasses: fake.DefaultSupportedUsageClasses,
+				},
+			},
+		})
+		awsEnv.EC2API.DescribeInstanceTypeOfferingsOutput.Set(&ec2.DescribeInstanceTypeOfferingsOutput{
+			InstanceTypeOfferings: []*ec2.InstanceTypeOffering{
+				{
+					InstanceType: aws.String("test"),
+					Location:     aws.String("test-zone-1a"),
+				},
+			},
+		})
+		Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
+		Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())
+
+		nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, corev1beta1.NodeSelectorRequirementWithMinValues{
+			NodeSelectorRequirement: v1.NodeSelectorRequirement{
+				Key:      v1.LabelInstanceTypeStable,
+				Operator: v1.NodeSelectorOpIn,
+				Values:   []string{"test"},
+			},
+		})
+		nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+		pod := coretest.UnschedulablePod(coretest.PodOptions{
+			ResourceRequirements: v1.ResourceRequirements{
+				Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
+				Limits:   v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
+			},
+		})
+		ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
+		ExpectNotScheduled(ctx, env.Client, pod)
+	})
+	It("should launch instances for nvidia.com/gpu resource requests", func() {
 		nodeNames := sets.NewString()
 		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
 		pods := []*v1.Pod{
@@ -648,7 +721,7 @@ var _ = Describe("InstanceTypeProvider", func() {
 		}
 		Expect(nodeNames.Len()).To(Equal(2))
 	})
-	It("should launch instances for Habana GPU resource requests", func() {
+	It("should launch instances for habana.ai/gaudi resource requests", func() {
 		nodeNames := sets.NewString()
 		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
 		pods := []*v1.Pod{
@@ -679,7 +752,7 @@ var _ = Describe("InstanceTypeProvider", func() {
 		}
 		Expect(nodeNames.Len()).To(Equal(1))
 	})
-	It("should launch instances for AWS Neuron resource requests", func() {
+	It("should launch instances for aws.amazon.com/neuron resource requests", func() {
 		nodeNames := sets.NewString()
 		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
 		pods := []*v1.Pod{
@@ -712,7 +785,7 @@ var _ = Describe("InstanceTypeProvider", func() {
 		}
 		Expect(nodeNames.Len()).To(Equal(2))
 	})
-	It("should launch trn1 instances for AWS Neuron resource requests", func() {
+	It("should launch trn1 instances for aws.amazon.com/neuron resource requests", func() {
 		nodeNames := sets.NewString()
 		nodePool.Spec.Template.Spec.Requirements = []corev1beta1.NodeSelectorRequirementWithMinValues{
 			{
@@ -774,6 +847,39 @@ var _ = Describe("InstanceTypeProvider", func() {
 		}
 		Expect(nodes.Len()).To(Equal(1))
 	})
+	It("should launch instances for amd.com/gpu resource requests", func() {
+		nodeNames := sets.NewString()
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+		pods := []*v1.Pod{
+			coretest.UnschedulablePod(coretest.PodOptions{
+				ResourceRequirements: v1.ResourceRequirements{
+					Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
+					Limits:   v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
+				},
+			}),
+			// Should pack onto same instance
+			coretest.UnschedulablePod(coretest.PodOptions{
+				ResourceRequirements: v1.ResourceRequirements{
+					Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
+					Limits:   v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
+				},
+			}),
+			// Should pack onto a separate instance
+			coretest.UnschedulablePod(coretest.PodOptions{
+				ResourceRequirements: v1.ResourceRequirements{
+					Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
+					Limits:   v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
+				},
+			}),
+		}
+		ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
+		for _, pod := range pods {
+			node := ExpectScheduled(ctx, env.Client, pod)
+			Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "g4ad.16xlarge"))
+			nodeNames.Insert(node.Name)
+		}
+		Expect(nodeNames.Len()).To(Equal(2))
+	})
 	It("should not launch instances w/ instance storage for ephemeral storage resource requests when exceeding blockDeviceMapping", func() {
 		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
 		pod := coretest.UnschedulablePod(coretest.PodOptions{

diff --git a/pkg/providers/instancetype/types.go b/pkg/providers/instancetype/types.go
@@ -65,7 +65,7 @@ func NewInstanceType(ctx context.Context, info *ec2.InstanceTypeInfo, region str
 		},
 	}
 	if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(v1.LabelOSStable, v1.NodeSelectorOpIn, string(v1.Windows)))) == nil {
-		it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
+		it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(aws.StringValue(info.InstanceType))
 	}
 	return it
 }
@@ -263,6 +263,7 @@ func ephemeralStorage(info *ec2.InstanceTypeInfo, amiFamily amifamily.AMIFamily,
 	return amifamily.DefaultEBS.VolumeSize
 }
 
+// awsPodENI relies on the VPC resource controller to populate the vpc.amazonaws.com/pod-eni resource
 func awsPodENI(name string) *resource.Quantity {
 	// https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types
 	limits, ok := Limits[name]
@@ -350,10 +351,13 @@ func ENILimitedPods(ctx context.Context, info *ec2.InstanceTypeInfo) *resource.Q
 	return resources.Quantity(fmt.Sprint(usableNetworkInterfaces*(addressesPerInterface-1) + 2))
 }
 
-func privateIPv4Address(info *ec2.InstanceTypeInfo) *resource.Quantity {
+func privateIPv4Address(name string) *resource.Quantity {
 	//https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/ecbd6965a0100d9a070110233762593b16023287/pkg/provider/ip/provider.go#L297
-	capacity := aws.Int64Value(info.NetworkInfo.Ipv4AddressesPerInterface) - 1
-	return resources.Quantity(fmt.Sprint(capacity))
+	limits, ok := Limits[name]
+	if !ok {
+		return resources.Quantity("0")
+	}
+	return resources.Quantity(fmt.Sprint(limits.IPv4PerInterface - 1))
 }
 
 func systemReservedResources(systemReserved map[string]string) v1.ResourceList {

diff --git a/test/pkg/environment/aws/environment.go b/test/pkg/environment/aws/environment.go
@@ -56,11 +56,6 @@ var WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9"
 
 var EphemeralInitContainerImage = "alpine"
 
-// ExcludedInstanceFamilies denotes instance families that have issues during resource registration due to compatibility
-// issues with versions of the VPR Resource Controller.
-// TODO: jmdeal@ remove a1 from exclusion list once Karpenter implicitly filters a1 instances for AL2023 AMI family (incompatible)
-var ExcludedInstanceFamilies = []string{"m7a", "r7a", "c7a", "r7i", "a1", "c7i-flex"}
-
 type Environment struct {
 	*common.Environment
 	Region string

diff --git a/test/suites/ami/suite_test.go b/test/suites/ami/suite_test.go
@@ -15,18 +15,17 @@ limitations under the License.
 package ami_test
 
 import (
-	"testing"
-
-	awssdk "github.com/aws/aws-sdk-go/aws"
-
-	corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"
-
 	"encoding/base64"
 	"fmt"
 	"os"
 	"strings"
+	"testing"
 	"time"
 
+	awssdk "github.com/aws/aws-sdk-go/aws"
+
+	corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"
+
 	"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -194,17 +193,6 @@ var _ = Describe("AMI", func() {
 					return v1beta1.AMISelectorTerm{ID: env.GetAMIBySSMPath(ssmPath)}
 				})
 			}
-			// TODO: remove requirements after Ubuntu fixes bootstrap script issue w/
-			// new instance types not included in the max-pods.txt file. (https://github.com/aws/karpenter-provider-aws/issues/4472)
-			nodePool = coretest.ReplaceRequirements(nodePool,
-				corev1beta1.NodeSelectorRequirementWithMinValues{
-					NodeSelectorRequirement: v1.NodeSelectorRequirement{
-						Key:      v1beta1.LabelInstanceFamily,
-						Operator: v1.NodeSelectorOpNotIn,
-						Values:   environmentaws.ExcludedInstanceFamilies,
-					},
-				},
-			)
 			pod := coretest.Pod()
 			env.ExpectCreated(nodeClass, nodePool, pod)
 			env.EventuallyExpectHealthy(pod)
@@ -343,15 +331,7 @@ var _ = Describe("AMI", func() {
 			nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}}
 			nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}}
 
-			// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
 			nodePool = coretest.ReplaceRequirements(nodePool,
-				corev1beta1.NodeSelectorRequirementWithMinValues{
-					NodeSelectorRequirement: v1.NodeSelectorRequirement{
-						Key:      v1beta1.LabelInstanceFamily,
-						Operator: v1.NodeSelectorOpNotIn,
-						Values:   environmentaws.ExcludedInstanceFamilies,
-					},
-				},
 				corev1beta1.NodeSelectorRequirementWithMinValues{
 					NodeSelectorRequirement: v1.NodeSelectorRequirement{
 						Key:      v1.LabelOSStable,

diff --git a/test/suites/integration/extended_resources_test.go b/test/suites/integration/extended_resources_test.go
@@ -30,11 +30,10 @@ import (
 
 	corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"
 
-	"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
-	awsenv "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+
+	"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
 )
 
 var _ = Describe("Extended Resources", func() {
@@ -112,16 +111,6 @@ var _ = Describe("Extended Resources", func() {
 		DeferCleanup(func() {
 			env.ExpectPodENIDisabled()
 		})
-		// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
-		test.ReplaceRequirements(nodePool,
-			corev1beta1.NodeSelectorRequirementWithMinValues{
-				NodeSelectorRequirement: v1.NodeSelectorRequirement{
-					Key:      v1beta1.LabelInstanceFamily,
-					Operator: v1.NodeSelectorOpNotIn,
-					Values:   awsenv.ExcludedInstanceFamilies,
-				},
-			},
-		)
 		numPods := 1
 		dep := test.Deployment(test.DeploymentOptions{
 			Replicas: int32(numPods),

diff --git a/test/suites/integration/kubelet_config_test.go b/test/suites/integration/kubelet_config_test.go
@@ -121,15 +121,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
 				// Need to enable nodepool-level OS-scoping for now since DS evaluation is done off of the nodepool
 				// requirements, not off of the instance type options so scheduling can fail if nodepool aren't
 				// properly scoped
-				// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.*, c7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
 				test.ReplaceRequirements(nodePool,
-					corev1beta1.NodeSelectorRequirementWithMinValues{
-						NodeSelectorRequirement: v1.NodeSelectorRequirement{
-							Key:      v1beta1.LabelInstanceFamily,
-							Operator: v1.NodeSelectorOpNotIn,
-							Values:   aws.ExcludedInstanceFamilies,
-						},
-					},
 					corev1beta1.NodeSelectorRequirementWithMinValues{
 						NodeSelectorRequirement: v1.NodeSelectorRequirement{
 							Key:      v1.LabelOSStable,