Skip to content

Commit

Permalink
Add support for GPU enabled bottlerocket nodes (#298)
Browse files Browse the repository at this point in the history
  • Loading branch information
aidy authored Apr 12, 2022
1 parent a58487d commit 7339634
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 6 deletions.
7 changes: 7 additions & 0 deletions examples/cluster/bottlerocket_gpu_node_group/cluster.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Reads the outputs of the Terraform configuration in the parent directory
# from its local state file, so this example can reference them.
data "terraform_remote_state" "cluster" {
backend = "local"

config = {
# Resolved relative to this module's own directory.
path = "${path.module}/../terraform.tfstate"
}
}
30 changes: 30 additions & 0 deletions examples/cluster/bottlerocket_gpu_node_group/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Provider requirements for this example: the AWS provider from the
# hashicorp namespace, pinned to one exact version.
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "3.53.0"
}
}
}

# AWS provider configuration for this example.
provider "aws" {
region = "us-east-1"
# Restricts the provider to the listed account so the example cannot be
# applied against a different AWS account by mistake.
allowed_account_ids = ["214219211678"]
}

# Example node group built with the asg_node_group module, with both the
# Bottlerocket and GPU options enabled. It is configured from the
# cluster_config output read from the remote state.
module "node_group" {
source = "../../../modules/asg_node_group"

cluster_config = data.terraform_remote_state.cluster.outputs.cluster_config

name = "bottlerocket-gpu-nodes"
key_name = "development"
# Both flags are set together to request GPU-enabled Bottlerocket nodes.
bottlerocket = true
gpu = true
# Must be a GPU instance type when gpu = true.
instance_types = ["g4dn.xlarge"]
min_size = 1

# Extra label whose value is the cluster name from the remote state.
# NOTE(review): presumably used by the test suite to identify nodes of the
# test environment — confirm against the tests.
labels = {
"cookpad.com/terraform-aws-eks-test-environment" = data.terraform_remote_state.cluster.outputs.cluster_name
}
}
2 changes: 1 addition & 1 deletion modules/asg_node_group/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ module "bottlerocket_nodes" {
enable the [AWS EBS CSI driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver) by setting `aws_ebs_csi_driver = true` on the cluster module.
see: https://github.com/bottlerocket-os/bottlerocket/blob/develop/QUICKSTART-EKS.md#csi-plugin

⚠️ Bottlerocket does not yet [support GPU nodes](https://github.com/bottlerocket-os/bottlerocket/issues/769), do not set `gpu = true` when `bottlerocket = true`, as this may result in an invalid configuration!
⚠️ Bottlerocket now [supports GPU nodes](https://github.com/bottlerocket-os/bottlerocket/blob/develop/QUICKSTART-EKS.md#aws-k8s--nvidia-variants). To enable them, set `gpu = true` together with `bottlerocket = true`, and ensure that `instance_types` is set to a GPU instance type.

📝 If you want to get a shell session on your instances via Bottlerocket's SSM agent
you will need to attach the `arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore` policy
Expand Down
3 changes: 2 additions & 1 deletion modules/asg_node_group/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ locals {
labels = merge(
{ "node-group.k8s.cookpad.com/name" = local.node_group_label },
var.gpu ? { "nvidia.com/gpu" = "true" } : {},
var.bottlerocket ? { "bottlerocket" = "true" } : {},
var.labels,
)
}
Expand All @@ -65,7 +66,7 @@ data "aws_ami" "image" {
}

data "aws_ssm_parameter" "bottlerocket_image_id" {
name = "/aws/service/bottlerocket/aws-k8s-${local.k8s_version}/x86_64/latest/image_id"
name = "/aws/service/bottlerocket/aws-k8s-${local.k8s_version}${var.gpu ? "-nvidia" : ""}/x86_64/latest/image_id"
}

data "aws_ami" "bottlerocket_image" {
Expand Down
13 changes: 11 additions & 2 deletions modules/cluster/addons/helm/nvidia-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,11 @@
nodeSelector:
nvidia.com/gpu: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu
operator: In
values:
- "true"
- key: bottlerocket
operator: DoesNotExist
13 changes: 11 additions & 2 deletions modules/cluster/addons/nvidia-device-plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,17 @@ spec:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
nodeSelector:
nvidia.com/gpu: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu
operator: In
values:
- "true"
- key: bottlerocket
operator: DoesNotExist
tolerations:
- key: CriticalAddonsOnly
operator: Exists
Expand Down
12 changes: 12 additions & 0 deletions test/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,18 @@ func TestTerraformAwsEksCluster(t *testing.T) {
validateKubeBench(t, kubeconfig)
validateNodeTerminationHandler(t, kubeconfig)
})

// Stage: deploy the Bottlerocket GPU node group example, then run the GPU
// node, kube-bench and node-termination-handler validations against the
// cluster.
test_structure.RunTestStage(t, "validate_bottlerocket_gpu_node_group", func() {
terraformOptions := test_structure.LoadTerraformOptions(t, workingDir)
kubeconfig := writeKubeconfig(t, terraform.Output(t, terraformOptions, "cluster_name"))
// Remove the kubeconfig file when this stage returns.
defer os.Remove(kubeconfig)
gpuNodeGroupDir := "../examples/cluster/bottlerocket_gpu_node_group"
deployTerraform(t, gpuNodeGroupDir, map[string]interface{}{})
// NOTE(review): cleanup is deferred only after deployTerraform returns; if
// deployTerraform aborts the test, this defer is never registered and
// cleanupTerraform will not run — confirm that is acceptable.
defer cleanupTerraform(t, gpuNodeGroupDir)
validateGPUNodes(t, kubeconfig)
validateKubeBench(t, kubeconfig)
validateNodeTerminationHandler(t, kubeconfig)
})
}

func validateNodeLabels(t *testing.T, kubeconfig string, clusterName string) {
Expand Down

0 comments on commit 7339634

Please sign in to comment.