Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix failure launching windows pod when instance type isn't in vpc resource controller config #6415

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hack/codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ instanceTypeTestData() {
GENERATED_FILE="pkg/fake/zz_generated.describe_instance_types.go"

go run hack/code/instancetype_testdata_gen/main.go --out-file ${GENERATED_FILE} \
--instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium
--instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium,g4ad.16xlarge

checkForUpdates "${GENERATED_FILE}"
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/fake/ec2api.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,14 @@ func (e *EC2API) DescribeInstanceTypeOfferingsWithContext(_ context.Context, _ *
InstanceType: aws.String("g4dn.8xlarge"),
Location: aws.String("test-zone-1b"),
},
{
InstanceType: aws.String("g4ad.16xlarge"),
Location: aws.String("test-zone-1a"),
},
{
InstanceType: aws.String("g4ad.16xlarge"),
Location: aws.String("test-zone-1b"),
},
{
InstanceType: aws.String("t3.large"),
Location: aws.String("test-zone-1a"),
Expand Down
59 changes: 59 additions & 0 deletions pkg/fake/zz_generated.describe_instance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,65 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{
},
},
},
{
InstanceType: aws.String("g4ad.16xlarge"),
SupportedUsageClasses: aws.StringSlice([]string{"on-demand", "spot"}),
SupportedVirtualizationTypes: aws.StringSlice([]string{"hvm"}),
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Hypervisor: aws.String("nitro"),
ProcessorInfo: &ec2.ProcessorInfo{
Manufacturer: aws.String("AMD"),
SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
},
VCpuInfo: &ec2.VCpuInfo{
DefaultCores: aws.Int64(32),
DefaultVCpus: aws.Int64(64),
},
MemoryInfo: &ec2.MemoryInfo{
SizeInMiB: aws.Int64(262144),
},
EbsInfo: &ec2.EbsInfo{
EbsOptimizedInfo: &ec2.EbsOptimizedInfo{
BaselineBandwidthInMbps: aws.Int64(6300),
BaselineIops: aws.Int64(26667),
BaselineThroughputInMBps: aws.Float64(787.50),
MaximumBandwidthInMbps: aws.Int64(6300),
MaximumIops: aws.Int64(26667),
MaximumThroughputInMBps: aws.Float64(787.50),
},
EbsOptimizedSupport: aws.String("default"),
EncryptionSupport: aws.String("supported"),
NvmeSupport: aws.String("required"),
},
GpuInfo: &ec2.GpuInfo{
Gpus: []*ec2.GpuDeviceInfo{
{
Name: aws.String("Radeon Pro V520"),
Manufacturer: aws.String("AMD"),
Count: aws.Int64(4),
MemoryInfo: &ec2.GpuDeviceMemoryInfo{
SizeInMiB: aws.Int64(8192),
},
},
},
},
InstanceStorageInfo: &ec2.InstanceStorageInfo{NvmeSupport: aws.String("required"),
TotalSizeInGB: aws.Int64(2400),
},
NetworkInfo: &ec2.NetworkInfo{
MaximumNetworkInterfaces: aws.Int64(8),
Ipv4AddressesPerInterface: aws.Int64(30),
EncryptionInTransitSupported: aws.Bool(true),
DefaultNetworkCardIndex: aws.Int64(0),
NetworkCards: []*ec2.NetworkCardInfo{
{
NetworkCardIndex: aws.Int64(0),
MaximumNetworkInterfaces: aws.Int64(8),
},
},
},
},
{
InstanceType: aws.String("g4dn.8xlarge"),
SupportedUsageClasses: aws.StringSlice([]string{"on-demand", "spot"}),
Expand Down
116 changes: 111 additions & 5 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ var _ = Describe("InstanceTypeProvider", func() {
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectScheduled(ctx, env.Client, pod)
})
It("should launch AWS Pod ENI on a compatible instance type", func() {
It("should launch vpc.amazonaws.com/pod-eni on a compatible instance type", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Expand All @@ -615,7 +615,80 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(supportsPodENI()).To(Equal(true))
})
It("should launch instances for Nvidia GPU resource requests", func() {
It("should launch vpc.amazonaws.com/PrivateIPv4Address on a compatible instance type", func() {
nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKey(v1.LabelInstanceTypeStable))
limits, ok := instancetype.Limits[node.Labels[v1.LabelInstanceTypeStable]]
Expect(ok).To(BeTrue())
Expect(limits.IPv4PerInterface).ToNot(BeZero())
})
It("should not launch instance type for vpc.amazonaws.com/PrivateIPv4Address if VPC resource controller doesn't advertise it", func() {
// Create a "test" instance type that has PrivateIPv4Addresses but isn't advertised in the VPC limits config
awsEnv.EC2API.DescribeInstanceTypesOutput.Set(&ec2.DescribeInstanceTypesOutput{
InstanceTypes: []*ec2.InstanceTypeInfo{
{
InstanceType: aws.String("test"),
ProcessorInfo: &ec2.ProcessorInfo{
SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
},
VCpuInfo: &ec2.VCpuInfo{
DefaultCores: aws.Int64(1),
DefaultVCpus: aws.Int64(2),
},
MemoryInfo: &ec2.MemoryInfo{
SizeInMiB: aws.Int64(8192),
},
NetworkInfo: &ec2.NetworkInfo{
Ipv4AddressesPerInterface: aws.Int64(10),
DefaultNetworkCardIndex: aws.Int64(0),
NetworkCards: []*ec2.NetworkCardInfo{{
NetworkCardIndex: lo.ToPtr(int64(0)),
MaximumNetworkInterfaces: aws.Int64(3),
}},
},
SupportedUsageClasses: fake.DefaultSupportedUsageClasses,
},
},
})
awsEnv.EC2API.DescribeInstanceTypeOfferingsOutput.Set(&ec2.DescribeInstanceTypeOfferingsOutput{
InstanceTypeOfferings: []*ec2.InstanceTypeOffering{
{
InstanceType: aws.String("test"),
Location: aws.String("test-zone-1a"),
},
},
})
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())

nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelInstanceTypeStable,
Operator: v1.NodeSelectorOpIn,
Values: []string{"test"},
},
})
nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectNotScheduled(ctx, env.Client, pod)
})
It("should launch instances for nvidia.com/gpu resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -648,7 +721,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should launch instances for Habana GPU resource requests", func() {
It("should launch instances for habana.ai/gaudi resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -679,7 +752,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(1))
})
It("should launch instances for AWS Neuron resource requests", func() {
It("should launch instances for aws.amazon.com/neuron resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -712,7 +785,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should launch trn1 instances for AWS Neuron resource requests", func() {
It("should launch trn1 instances for aws.amazon.com/neuron resource requests", func() {
nodeNames := sets.NewString()
nodePool.Spec.Template.Spec.Requirements = []corev1beta1.NodeSelectorRequirementWithMinValues{
{
Expand Down Expand Up @@ -774,6 +847,39 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodes.Len()).To(Equal(1))
})
It("should launch instances for amd.com/gpu resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
},
}),
// Should pack onto same instance
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
},
}),
// Should pack onto a separate instance
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
},
}),
}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
for _, pod := range pods {
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "g4ad.16xlarge"))
nodeNames.Insert(node.Name)
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should not launch instances w/ instance storage for ephemeral storage resource requests when exceeding blockDeviceMapping", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
Expand Down
16 changes: 10 additions & 6 deletions pkg/providers/instancetype/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func NewInstanceType(ctx context.Context, info *ec2.InstanceTypeInfo, region str
},
}
if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(v1.LabelOSStable, v1.NodeSelectorOpIn, string(v1.Windows)))) == nil {
it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(aws.StringValue(info.InstanceType))
}
return it
}
Expand Down Expand Up @@ -263,9 +263,10 @@ func ephemeralStorage(info *ec2.InstanceTypeInfo, amiFamily amifamily.AMIFamily,
return amifamily.DefaultEBS.VolumeSize
}

func awsPodENI(name string) *resource.Quantity {
// awsPodENI relies on the VPC resource controller to populate the vpc.amazonaws.com/pod-eni resource
func awsPodENI(instanceTypeName string) *resource.Quantity {
// https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types
limits, ok := Limits[name]
limits, ok := Limits[instanceTypeName]
if ok && limits.IsTrunkingCompatible {
return resources.Quantity(fmt.Sprint(limits.BranchInterface))
}
Expand Down Expand Up @@ -350,10 +351,13 @@ func ENILimitedPods(ctx context.Context, info *ec2.InstanceTypeInfo) *resource.Q
return resources.Quantity(fmt.Sprint(usableNetworkInterfaces*(addressesPerInterface-1) + 2))
}

func privateIPv4Address(info *ec2.InstanceTypeInfo) *resource.Quantity {
func privateIPv4Address(instanceTypeName string) *resource.Quantity {
//https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/ecbd6965a0100d9a070110233762593b16023287/pkg/provider/ip/provider.go#L297
capacity := aws.Int64Value(info.NetworkInfo.Ipv4AddressesPerInterface) - 1
return resources.Quantity(fmt.Sprint(capacity))
limits, ok := Limits[instanceTypeName]
if !ok {
return resources.Quantity("0")
}
return resources.Quantity(fmt.Sprint(limits.IPv4PerInterface - 1))
}

func systemReservedResources(systemReserved map[string]string) v1.ResourceList {
Expand Down
5 changes: 0 additions & 5 deletions test/pkg/environment/aws/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ var WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9"

var EphemeralInitContainerImage = "alpine"

// ExcludedInstanceFamilies denotes instance families that have issues during resource registration due to compatibility
// issues with versions of the VPR Resource Controller.
// TODO: jmdeal@ remove a1 from exclusion list once Karpenter implicitly filters a1 instances for AL2023 AMI family (incompatible)
var ExcludedInstanceFamilies = []string{"m7a", "r7a", "c7a", "r7i", "a1", "c7i-flex"}

type Environment struct {
*common.Environment
Region string
Expand Down
30 changes: 5 additions & 25 deletions test/suites/ami/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@ limitations under the License.
package ami_test

import (
"testing"

awssdk "github.com/aws/aws-sdk-go/aws"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"encoding/base64"
"fmt"
"os"
"strings"
"testing"
"time"

awssdk "github.com/aws/aws-sdk-go/aws"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"

. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -194,17 +193,6 @@ var _ = Describe("AMI", func() {
return v1beta1.AMISelectorTerm{ID: env.GetAMIBySSMPath(ssmPath)}
})
}
// TODO: remove requirements after Ubuntu fixes bootstrap script issue w/
// new instance types not included in the max-pods.txt file. (https://github.com/aws/karpenter-provider-aws/issues/4472)
nodePool = coretest.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: environmentaws.ExcludedInstanceFamilies,
},
},
)
pod := coretest.Pod()
env.ExpectCreated(nodeClass, nodePool, pod)
env.EventuallyExpectHealthy(pod)
Expand Down Expand Up @@ -343,15 +331,7 @@ var _ = Describe("AMI", func() {
nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}}
nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}}

// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
nodePool = coretest.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: environmentaws.ExcludedInstanceFamilies,
},
},
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelOSStable,
Expand Down
15 changes: 2 additions & 13 deletions test/suites/integration/extended_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ import (

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
awsenv "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
)

var _ = Describe("Extended Resources", func() {
Expand Down Expand Up @@ -112,16 +111,6 @@ var _ = Describe("Extended Resources", func() {
DeferCleanup(func() {
env.ExpectPodENIDisabled()
})
// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
test.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: awsenv.ExcludedInstanceFamilies,
},
},
)
numPods := 1
dep := test.Deployment(test.DeploymentOptions{
Replicas: int32(numPods),
Expand Down
Loading
Loading