Skip to content

Commit

Permalink
fix: Fix failure launching windows pod when instance type isn't in vp…
Browse files Browse the repository at this point in the history
…c resource controller config (aws#6415)
  • Loading branch information
jonathan-innis authored and LPetro committed Jul 4, 2024
1 parent a3d8ac4 commit e19884e
Show file tree
Hide file tree
Showing 10 changed files with 196 additions and 71 deletions.
2 changes: 1 addition & 1 deletion hack/codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ instanceTypeTestData() {
GENERATED_FILE="pkg/fake/zz_generated.describe_instance_types.go"

go run hack/code/instancetype_testdata_gen/main.go --out-file ${GENERATED_FILE} \
--instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium
--instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium,g4ad.16xlarge

checkForUpdates "${GENERATED_FILE}"
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/fake/ec2api.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,14 @@ func (e *EC2API) DescribeInstanceTypeOfferingsWithContext(_ context.Context, _ *
InstanceType: aws.String("g4dn.8xlarge"),
Location: aws.String("test-zone-1b"),
},
{
InstanceType: aws.String("g4ad.16xlarge"),
Location: aws.String("test-zone-1a"),
},
{
InstanceType: aws.String("g4ad.16xlarge"),
Location: aws.String("test-zone-1b"),
},
{
InstanceType: aws.String("t3.large"),
Location: aws.String("test-zone-1a"),
Expand Down
59 changes: 59 additions & 0 deletions pkg/fake/zz_generated.describe_instance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,65 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{
},
},
},
{
InstanceType: aws.String("g4ad.16xlarge"),
SupportedUsageClasses: aws.StringSlice([]string{"on-demand", "spot"}),
SupportedVirtualizationTypes: aws.StringSlice([]string{"hvm"}),
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Hypervisor: aws.String("nitro"),
ProcessorInfo: &ec2.ProcessorInfo{
Manufacturer: aws.String("AMD"),
SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
},
VCpuInfo: &ec2.VCpuInfo{
DefaultCores: aws.Int64(32),
DefaultVCpus: aws.Int64(64),
},
MemoryInfo: &ec2.MemoryInfo{
SizeInMiB: aws.Int64(262144),
},
EbsInfo: &ec2.EbsInfo{
EbsOptimizedInfo: &ec2.EbsOptimizedInfo{
BaselineBandwidthInMbps: aws.Int64(6300),
BaselineIops: aws.Int64(26667),
BaselineThroughputInMBps: aws.Float64(787.50),
MaximumBandwidthInMbps: aws.Int64(6300),
MaximumIops: aws.Int64(26667),
MaximumThroughputInMBps: aws.Float64(787.50),
},
EbsOptimizedSupport: aws.String("default"),
EncryptionSupport: aws.String("supported"),
NvmeSupport: aws.String("required"),
},
GpuInfo: &ec2.GpuInfo{
Gpus: []*ec2.GpuDeviceInfo{
{
Name: aws.String("Radeon Pro V520"),
Manufacturer: aws.String("AMD"),
Count: aws.Int64(4),
MemoryInfo: &ec2.GpuDeviceMemoryInfo{
SizeInMiB: aws.Int64(8192),
},
},
},
},
InstanceStorageInfo: &ec2.InstanceStorageInfo{NvmeSupport: aws.String("required"),
TotalSizeInGB: aws.Int64(2400),
},
NetworkInfo: &ec2.NetworkInfo{
MaximumNetworkInterfaces: aws.Int64(8),
Ipv4AddressesPerInterface: aws.Int64(30),
EncryptionInTransitSupported: aws.Bool(true),
DefaultNetworkCardIndex: aws.Int64(0),
NetworkCards: []*ec2.NetworkCardInfo{
{
NetworkCardIndex: aws.Int64(0),
MaximumNetworkInterfaces: aws.Int64(8),
},
},
},
},
{
InstanceType: aws.String("g4dn.8xlarge"),
SupportedUsageClasses: aws.StringSlice([]string{"on-demand", "spot"}),
Expand Down
116 changes: 111 additions & 5 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ var _ = Describe("InstanceTypeProvider", func() {
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectScheduled(ctx, env.Client, pod)
})
It("should launch AWS Pod ENI on a compatible instance type", func() {
It("should launch vpc.amazonaws.com/pod-eni on a compatible instance type", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Expand All @@ -615,7 +615,80 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(supportsPodENI()).To(Equal(true))
})
It("should launch instances for Nvidia GPU resource requests", func() {
It("should launch vpc.amazonaws.com/PrivateIPv4Address on a compatible instance type", func() {
nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKey(v1.LabelInstanceTypeStable))
limits, ok := instancetype.Limits[node.Labels[v1.LabelInstanceTypeStable]]
Expect(ok).To(BeTrue())
Expect(limits.IPv4PerInterface).ToNot(BeZero())
})
It("should not launch instance type for vpc.amazonaws.com/PrivateIPv4Address if VPC resource controller doesn't advertise it", func() {
// Create a "test" instance type that has PrivateIPv4Addresses but isn't advertised in the VPC limits config
awsEnv.EC2API.DescribeInstanceTypesOutput.Set(&ec2.DescribeInstanceTypesOutput{
InstanceTypes: []*ec2.InstanceTypeInfo{
{
InstanceType: aws.String("test"),
ProcessorInfo: &ec2.ProcessorInfo{
SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
},
VCpuInfo: &ec2.VCpuInfo{
DefaultCores: aws.Int64(1),
DefaultVCpus: aws.Int64(2),
},
MemoryInfo: &ec2.MemoryInfo{
SizeInMiB: aws.Int64(8192),
},
NetworkInfo: &ec2.NetworkInfo{
Ipv4AddressesPerInterface: aws.Int64(10),
DefaultNetworkCardIndex: aws.Int64(0),
NetworkCards: []*ec2.NetworkCardInfo{{
NetworkCardIndex: lo.ToPtr(int64(0)),
MaximumNetworkInterfaces: aws.Int64(3),
}},
},
SupportedUsageClasses: fake.DefaultSupportedUsageClasses,
},
},
})
awsEnv.EC2API.DescribeInstanceTypeOfferingsOutput.Set(&ec2.DescribeInstanceTypeOfferingsOutput{
InstanceTypeOfferings: []*ec2.InstanceTypeOffering{
{
InstanceType: aws.String("test"),
Location: aws.String("test-zone-1a"),
},
},
})
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())

nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelInstanceTypeStable,
Operator: v1.NodeSelectorOpIn,
Values: []string{"test"},
},
})
nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectNotScheduled(ctx, env.Client, pod)
})
It("should launch instances for nvidia.com/gpu resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -648,7 +721,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should launch instances for Habana GPU resource requests", func() {
It("should launch instances for habana.ai/gaudi resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -679,7 +752,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(1))
})
It("should launch instances for AWS Neuron resource requests", func() {
It("should launch instances for aws.amazon.com/neuron resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -712,7 +785,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should launch trn1 instances for AWS Neuron resource requests", func() {
It("should launch trn1 instances for aws.amazon.com/neuron resource requests", func() {
nodeNames := sets.NewString()
nodePool.Spec.Template.Spec.Requirements = []corev1beta1.NodeSelectorRequirementWithMinValues{
{
Expand Down Expand Up @@ -774,6 +847,39 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodes.Len()).To(Equal(1))
})
It("should launch instances for amd.com/gpu resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
},
}),
// Should pack onto same instance
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
},
}),
// Should pack onto a separate instance
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
},
}),
}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
for _, pod := range pods {
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "g4ad.16xlarge"))
nodeNames.Insert(node.Name)
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should not launch instances w/ instance storage for ephemeral storage resource requests when exceeding blockDeviceMapping", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
Expand Down
16 changes: 10 additions & 6 deletions pkg/providers/instancetype/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func NewInstanceType(ctx context.Context, info *ec2.InstanceTypeInfo, region str
},
}
if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(v1.LabelOSStable, v1.NodeSelectorOpIn, string(v1.Windows)))) == nil {
it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(aws.StringValue(info.InstanceType))
}
return it
}
Expand Down Expand Up @@ -263,9 +263,10 @@ func ephemeralStorage(info *ec2.InstanceTypeInfo, amiFamily amifamily.AMIFamily,
return amifamily.DefaultEBS.VolumeSize
}

func awsPodENI(name string) *resource.Quantity {
// awsPodENI relies on the VPC resource controller to populate the vpc.amazonaws.com/pod-eni resource
func awsPodENI(instanceTypeName string) *resource.Quantity {
// https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types
limits, ok := Limits[name]
limits, ok := Limits[instanceTypeName]
if ok && limits.IsTrunkingCompatible {
return resources.Quantity(fmt.Sprint(limits.BranchInterface))
}
Expand Down Expand Up @@ -350,10 +351,13 @@ func ENILimitedPods(ctx context.Context, info *ec2.InstanceTypeInfo) *resource.Q
return resources.Quantity(fmt.Sprint(usableNetworkInterfaces*(addressesPerInterface-1) + 2))
}

func privateIPv4Address(info *ec2.InstanceTypeInfo) *resource.Quantity {
func privateIPv4Address(instanceTypeName string) *resource.Quantity {
//https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/ecbd6965a0100d9a070110233762593b16023287/pkg/provider/ip/provider.go#L297
capacity := aws.Int64Value(info.NetworkInfo.Ipv4AddressesPerInterface) - 1
return resources.Quantity(fmt.Sprint(capacity))
limits, ok := Limits[instanceTypeName]
if !ok {
return resources.Quantity("0")
}
return resources.Quantity(fmt.Sprint(limits.IPv4PerInterface - 1))
}

func systemReservedResources(systemReserved map[string]string) v1.ResourceList {
Expand Down
5 changes: 0 additions & 5 deletions test/pkg/environment/aws/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ var WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9"

var EphemeralInitContainerImage = "alpine"

// ExcludedInstanceFamilies denotes instance families that have issues during resource registration due to compatibility
// issues with versions of the VPR Resource Controller.
// TODO: jmdeal@ remove a1 from exclusion list once Karpenter implicitly filters a1 instances for AL2023 AMI family (incompatible)
var ExcludedInstanceFamilies = []string{"m7a", "r7a", "c7a", "r7i", "a1", "c7i-flex"}

type Environment struct {
*common.Environment
Region string
Expand Down
30 changes: 5 additions & 25 deletions test/suites/ami/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@ limitations under the License.
package ami_test

import (
"testing"

awssdk "github.com/aws/aws-sdk-go/aws"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"encoding/base64"
"fmt"
"os"
"strings"
"testing"
"time"

awssdk "github.com/aws/aws-sdk-go/aws"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"

. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -194,17 +193,6 @@ var _ = Describe("AMI", func() {
return v1beta1.AMISelectorTerm{ID: env.GetAMIBySSMPath(ssmPath)}
})
}
// TODO: remove requirements after Ubuntu fixes bootstrap script issue w/
// new instance types not included in the max-pods.txt file. (https://github.com/aws/karpenter-provider-aws/issues/4472)
nodePool = coretest.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: environmentaws.ExcludedInstanceFamilies,
},
},
)
pod := coretest.Pod()
env.ExpectCreated(nodeClass, nodePool, pod)
env.EventuallyExpectHealthy(pod)
Expand Down Expand Up @@ -343,15 +331,7 @@ var _ = Describe("AMI", func() {
nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}}
nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}}

// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
nodePool = coretest.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: environmentaws.ExcludedInstanceFamilies,
},
},
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelOSStable,
Expand Down
15 changes: 2 additions & 13 deletions test/suites/integration/extended_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ import (

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
awsenv "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
)

var _ = Describe("Extended Resources", func() {
Expand Down Expand Up @@ -112,16 +111,6 @@ var _ = Describe("Extended Resources", func() {
DeferCleanup(func() {
env.ExpectPodENIDisabled()
})
// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
test.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: awsenv.ExcludedInstanceFamilies,
},
},
)
numPods := 1
dep := test.Deployment(test.DeploymentOptions{
Replicas: int32(numPods),
Expand Down
Loading

0 comments on commit e19884e

Please sign in to comment.