Skip to content

Commit

Permalink
Fix failure launching windows pod when instance type isn't in vpc res…
Browse files Browse the repository at this point in the history
…ource controller config
  • Loading branch information
jonathan-innis committed Jun 27, 2024
1 parent d37e5a2 commit 281c614
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 69 deletions.
2 changes: 1 addition & 1 deletion hack/codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ instanceTypeTestData() {
GENERATED_FILE="pkg/fake/zz_generated.describe_instance_types.go"

go run hack/code/instancetype_testdata_gen/main.go --out-file ${GENERATED_FILE} \
--instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium
--instance-types t3.large,m5.large,m5.xlarge,p3.8xlarge,g4dn.8xlarge,c6g.large,inf1.2xlarge,inf1.6xlarge,trn1.2xlarge,m5.metal,dl1.24xlarge,m6idn.32xlarge,t4g.small,t4g.xlarge,t4g.medium,g4ad.16xlarge

checkForUpdates "${GENERATED_FILE}"
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/fake/ec2api.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,14 @@ func (e *EC2API) DescribeInstanceTypeOfferingsWithContext(_ context.Context, _ *
InstanceType: aws.String("g4dn.8xlarge"),
Location: aws.String("test-zone-1b"),
},
{
InstanceType: aws.String("g4ad.16xlarge"),
Location: aws.String("test-zone-1a"),
},
{
InstanceType: aws.String("g4ad.16xlarge"),
Location: aws.String("test-zone-1b"),
},
{
InstanceType: aws.String("t3.large"),
Location: aws.String("test-zone-1a"),
Expand Down
59 changes: 59 additions & 0 deletions pkg/fake/zz_generated.describe_instance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,65 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{
},
},
},
{
InstanceType: aws.String("g4ad.16xlarge"),
SupportedUsageClasses: aws.StringSlice([]string{"on-demand", "spot"}),
SupportedVirtualizationTypes: aws.StringSlice([]string{"hvm"}),
BurstablePerformanceSupported: aws.Bool(false),
BareMetal: aws.Bool(false),
Hypervisor: aws.String("nitro"),
ProcessorInfo: &ec2.ProcessorInfo{
Manufacturer: aws.String("AMD"),
SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
},
VCpuInfo: &ec2.VCpuInfo{
DefaultCores: aws.Int64(32),
DefaultVCpus: aws.Int64(64),
},
MemoryInfo: &ec2.MemoryInfo{
SizeInMiB: aws.Int64(262144),
},
EbsInfo: &ec2.EbsInfo{
EbsOptimizedInfo: &ec2.EbsOptimizedInfo{
BaselineBandwidthInMbps: aws.Int64(6300),
BaselineIops: aws.Int64(26667),
BaselineThroughputInMBps: aws.Float64(787.50),
MaximumBandwidthInMbps: aws.Int64(6300),
MaximumIops: aws.Int64(26667),
MaximumThroughputInMBps: aws.Float64(787.50),
},
EbsOptimizedSupport: aws.String("default"),
EncryptionSupport: aws.String("supported"),
NvmeSupport: aws.String("required"),
},
GpuInfo: &ec2.GpuInfo{
Gpus: []*ec2.GpuDeviceInfo{
{
Name: aws.String("Radeon Pro V520"),
Manufacturer: aws.String("AMD"),
Count: aws.Int64(4),
MemoryInfo: &ec2.GpuDeviceMemoryInfo{
SizeInMiB: aws.Int64(8192),
},
},
},
},
InstanceStorageInfo: &ec2.InstanceStorageInfo{NvmeSupport: aws.String("required"),
TotalSizeInGB: aws.Int64(2400),
},
NetworkInfo: &ec2.NetworkInfo{
MaximumNetworkInterfaces: aws.Int64(8),
Ipv4AddressesPerInterface: aws.Int64(30),
EncryptionInTransitSupported: aws.Bool(true),
DefaultNetworkCardIndex: aws.Int64(0),
NetworkCards: []*ec2.NetworkCardInfo{
{
NetworkCardIndex: aws.Int64(0),
MaximumNetworkInterfaces: aws.Int64(8),
},
},
},
},
{
InstanceType: aws.String("g4dn.8xlarge"),
SupportedUsageClasses: aws.StringSlice([]string{"on-demand", "spot"}),
Expand Down
116 changes: 111 additions & 5 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ var _ = Describe("InstanceTypeProvider", func() {
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectScheduled(ctx, env.Client, pod)
})
It("should launch AWS Pod ENI on a compatible instance type", func() {
It("should launch vpc.amazonaws.com/pod-eni on a compatible instance type", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Expand All @@ -615,7 +615,80 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(supportsPodENI()).To(Equal(true))
})
It("should launch instances for Nvidia GPU resource requests", func() {
It("should launch vpc.amazonaws.com/PrivateIPv4Address on a compatible instance type", func() {
nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKey(v1.LabelInstanceTypeStable))
limits, ok := instancetype.Limits[node.Labels[v1.LabelInstanceTypeStable]]
Expect(ok).To(BeTrue())
Expect(limits.IPv4PerInterface).ToNot(BeZero())
})
It("should not launch instance type for vpc.amazonaws.com/PrivateIPv4Address if VPC resource controller doesn't advertise it", func() {
// Create a "test" instance type that has PrivateIPv4Addresses but isn't advertised in the VPC limits config
awsEnv.EC2API.DescribeInstanceTypesOutput.Set(&ec2.DescribeInstanceTypesOutput{
InstanceTypes: []*ec2.InstanceTypeInfo{
{
InstanceType: aws.String("test"),
ProcessorInfo: &ec2.ProcessorInfo{
SupportedArchitectures: aws.StringSlice([]string{"x86_64"}),
},
VCpuInfo: &ec2.VCpuInfo{
DefaultCores: aws.Int64(1),
DefaultVCpus: aws.Int64(2),
},
MemoryInfo: &ec2.MemoryInfo{
SizeInMiB: aws.Int64(8192),
},
NetworkInfo: &ec2.NetworkInfo{
Ipv4AddressesPerInterface: aws.Int64(10),
DefaultNetworkCardIndex: aws.Int64(0),
NetworkCards: []*ec2.NetworkCardInfo{{
NetworkCardIndex: lo.ToPtr(int64(0)),
MaximumNetworkInterfaces: aws.Int64(3),
}},
},
SupportedUsageClasses: fake.DefaultSupportedUsageClasses,
},
},
})
awsEnv.EC2API.DescribeInstanceTypeOfferingsOutput.Set(&ec2.DescribeInstanceTypeOfferingsOutput{
InstanceTypeOfferings: []*ec2.InstanceTypeOffering{
{
InstanceType: aws.String("test"),
Location: aws.String("test-zone-1a"),
},
},
})
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())

nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelInstanceTypeStable,
Operator: v1.NodeSelectorOpIn,
Values: []string{"test"},
},
})
nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourcePrivateIPv4Address: resource.MustParse("1")},
},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectNotScheduled(ctx, env.Client, pod)
})
It("should launch instances for nvidia.com/gpu resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -648,7 +721,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should launch instances for Habana GPU resource requests", func() {
It("should launch instances for habana.ai/gaudi resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -679,7 +752,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(1))
})
It("should launch instances for AWS Neuron resource requests", func() {
It("should launch instances for aws.amazon.com/neuron resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
Expand Down Expand Up @@ -712,7 +785,7 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should launch trn1 instances for AWS Neuron resource requests", func() {
It("should launch trn1 instances for aws.amazon.com/neuron resource requests", func() {
nodeNames := sets.NewString()
nodePool.Spec.Template.Spec.Requirements = []corev1beta1.NodeSelectorRequirementWithMinValues{
{
Expand Down Expand Up @@ -774,6 +847,39 @@ var _ = Describe("InstanceTypeProvider", func() {
}
Expect(nodes.Len()).To(Equal(1))
})
It("should launch instances for amd.com/gpu resource requests", func() {
nodeNames := sets.NewString()
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pods := []*v1.Pod{
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("1")},
},
}),
// Should pack onto same instance
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("2")},
},
}),
// Should pack onto a separate instance
coretest.UnschedulablePod(coretest.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
Limits: v1.ResourceList{v1beta1.ResourceAMDGPU: resource.MustParse("4")},
},
}),
}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pods...)
for _, pod := range pods {
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "g4ad.16xlarge"))
nodeNames.Insert(node.Name)
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should not launch instances w/ instance storage for ephemeral storage resource requests when exceeding blockDeviceMapping", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{
Expand Down
12 changes: 8 additions & 4 deletions pkg/providers/instancetype/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func NewInstanceType(ctx context.Context, info *ec2.InstanceTypeInfo, region str
},
}
if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(v1.LabelOSStable, v1.NodeSelectorOpIn, string(v1.Windows)))) == nil {
it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(aws.StringValue(info.InstanceType))
}
return it
}
Expand Down Expand Up @@ -263,6 +263,7 @@ func ephemeralStorage(info *ec2.InstanceTypeInfo, amiFamily amifamily.AMIFamily,
return amifamily.DefaultEBS.VolumeSize
}

// awsPodENI relies on the VPC resource controller to populate the vpc.amazonaws.com/pod-eni resource
func awsPodENI(name string) *resource.Quantity {
// https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types
limits, ok := Limits[name]
Expand Down Expand Up @@ -350,10 +351,13 @@ func ENILimitedPods(ctx context.Context, info *ec2.InstanceTypeInfo) *resource.Q
return resources.Quantity(fmt.Sprint(usableNetworkInterfaces*(addressesPerInterface-1) + 2))
}

func privateIPv4Address(info *ec2.InstanceTypeInfo) *resource.Quantity {
func privateIPv4Address(name string) *resource.Quantity {
//https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/ecbd6965a0100d9a070110233762593b16023287/pkg/provider/ip/provider.go#L297
capacity := aws.Int64Value(info.NetworkInfo.Ipv4AddressesPerInterface) - 1
return resources.Quantity(fmt.Sprint(capacity))
limits, ok := Limits[name]
if !ok {
return resources.Quantity("0")
}
return resources.Quantity(fmt.Sprint(limits.IPv4PerInterface - 1))
}

func systemReservedResources(systemReserved map[string]string) v1.ResourceList {
Expand Down
5 changes: 0 additions & 5 deletions test/pkg/environment/aws/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ var WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9"

var EphemeralInitContainerImage = "alpine"

// ExcludedInstanceFamilies denotes instance families that have issues during resource registration due to compatibility
// issues with versions of the VPR Resource Controller.
// TODO: jmdeal@ remove a1 from exclusion list once Karpenter implicitly filters a1 instances for AL2023 AMI family (incompatible)
var ExcludedInstanceFamilies = []string{"m7a", "r7a", "c7a", "r7i", "a1", "c7i-flex"}

type Environment struct {
*common.Environment
Region string
Expand Down
30 changes: 5 additions & 25 deletions test/suites/ami/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@ limitations under the License.
package ami_test

import (
"testing"

awssdk "github.com/aws/aws-sdk-go/aws"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"encoding/base64"
"fmt"
"os"
"strings"
"testing"
"time"

awssdk "github.com/aws/aws-sdk-go/aws"

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"

. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -194,17 +193,6 @@ var _ = Describe("AMI", func() {
return v1beta1.AMISelectorTerm{ID: env.GetAMIBySSMPath(ssmPath)}
})
}
// TODO: remove requirements after Ubuntu fixes bootstrap script issue w/
// new instance types not included in the max-pods.txt file. (https://github.com/aws/karpenter-provider-aws/issues/4472)
nodePool = coretest.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: environmentaws.ExcludedInstanceFamilies,
},
},
)
pod := coretest.Pod()
env.ExpectCreated(nodeClass, nodePool, pod)
env.EventuallyExpectHealthy(pod)
Expand Down Expand Up @@ -343,15 +331,7 @@ var _ = Describe("AMI", func() {
nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}}
nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}}

// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
nodePool = coretest.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: environmentaws.ExcludedInstanceFamilies,
},
},
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelOSStable,
Expand Down
15 changes: 2 additions & 13 deletions test/suites/integration/extended_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ import (

corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
awsenv "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
)

var _ = Describe("Extended Resources", func() {
Expand Down Expand Up @@ -112,16 +111,6 @@ var _ = Describe("Extended Resources", func() {
DeferCleanup(func() {
env.ExpectPodENIDisabled()
})
// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
test.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: awsenv.ExcludedInstanceFamilies,
},
},
)
numPods := 1
dep := test.Deployment(test.DeploymentOptions{
Replicas: int32(numPods),
Expand Down
8 changes: 0 additions & 8 deletions test/suites/integration/kubelet_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,15 +121,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
// Need to enable nodepool-level OS-scoping for now since DS evaluation is done off of the nodepool
// requirements, not off of the instance type options so scheduling can fail if nodepool aren't
// properly scoped
// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.*, c7a.* ENI data (https://github.com/aws/karpenter-provider-aws/issues/4472)
test.ReplaceRequirements(nodePool,
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1beta1.LabelInstanceFamily,
Operator: v1.NodeSelectorOpNotIn,
Values: aws.ExcludedInstanceFamilies,
},
},
corev1beta1.NodeSelectorRequirementWithMinValues{
NodeSelectorRequirement: v1.NodeSelectorRequirement{
Key: v1.LabelOSStable,
Expand Down
Loading

0 comments on commit 281c614

Please sign in to comment.