From ab266a52c5b8f885d1d99afb7a85cd653133adaf Mon Sep 17 00:00:00 2001 From: Vara Bonthu Date: Thu, 25 Jan 2024 09:17:33 -0800 Subject: [PATCH] feat: EMR on EKS Blueprint upgrade with Karpenter v0.32.1 (#402) --- .../terraform/emr-eks-karpenter/README.md | 5 +- .../terraform/emr-eks-karpenter/addons.tf | 15 +- .../benchmark/tpcds-benchmark-1t.yaml | 5 +- .../tpcds-benchmark-data-generation-1t.yaml | 5 +- .../driver-pod-template.yaml | 6 +- .../execute_emr_eks_job.sh | 6 +- .../executor-pod-template.yaml | 6 +- .../driver-pod-template.yaml | 6 +- .../executor-pod-template.yaml | 6 +- .../driver-pod-template.yaml | 6 +- .../executor-pod-template.yaml | 6 +- .../driver-pod-template.yaml | 6 +- .../executor-pod-template.yaml | 6 +- .../spark-compute-optimized-provisioner.yaml | 153 +++++++++-------- ...graviton-memory-optimized-provisioner.yaml | 159 +++++++++--------- .../spark-memory-optimized-provisioner.yaml | 159 +++++++++--------- analytics/terraform/emr-eks-karpenter/main.tf | 2 +- .../terraform/emr-eks-karpenter/variables.tf | 2 +- .../amazon-emr-on-eks/emr-eks-karpenter.md | 19 +-- 19 files changed, 295 insertions(+), 283 deletions(-) diff --git a/analytics/terraform/emr-eks-karpenter/README.md b/analytics/terraform/emr-eks-karpenter/README.md index e82035e8b..729388a13 100644 --- a/analytics/terraform/emr-eks-karpenter/README.md +++ b/analytics/terraform/emr-eks-karpenter/README.md @@ -30,7 +30,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ |------|--------|---------| | [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | | [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 | -| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.18 | | [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 | | [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.0 | | [emr\_containers](#module\_emr\_containers) | terraform-aws-modules/emr/aws//modules/virtual-cluster | ~> 1.0 | @@ -64,6 +64,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [kubernetes_secret_v1.spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | | [kubernetes_service_account_v1.spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | | [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | +| [random_string.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_ecr_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecr_authorization_token) | data source | @@ -81,7 +82,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| 
[eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no | +| [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.28"` | no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | | [enable\_emr\_spark\_operator](#input\_enable\_emr\_spark\_operator) | Enable the Spark Operator to submit jobs with EMR Runtime | `bool` | `false` | no | | [enable\_fsx\_for\_lustre](#input\_enable\_fsx\_for\_lustre) | Deploys fsx for lustre addon, storage class and static FSx for Lustre filesystem for EMR | `bool` | `false` | no | diff --git a/analytics/terraform/emr-eks-karpenter/addons.tf b/analytics/terraform/emr-eks-karpenter/addons.tf index 51d824fb0..679a4adce 100644 --- a/analytics/terraform/emr-eks-karpenter/addons.tf +++ b/analytics/terraform/emr-eks-karpenter/addons.tf @@ -21,7 +21,7 @@ module "ebs_csi_driver_irsa" { #--------------------------------------------------------------- module "eks_blueprints_addons" { source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.2" + version = "~> 1.2" # change this to version = 1.2.2 for oldder version of Karpenter deployment cluster_name = module.eks.cluster_name cluster_endpoint = module.eks.cluster_endpoint @@ -87,11 +87,14 @@ module "eks_blueprints_addons" { enable_karpenter = true karpenter_enable_spot_termination = true karpenter_node = { + iam_role_use_name_prefix = false + iam_role_name = "${local.name}-karpenter-node" iam_role_additional_policies = { AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" } } karpenter = { + chart_version = "v0.33.1" repository_username = data.aws_ecrpublic_authorization_token.token.user_name repository_password = data.aws_ecrpublic_authorization_token.token.password } @@ -107,7 +110,7 @@ module "eks_blueprints_addons" { #--------------------------------------- # Adding AWS Load Balancer Controller #--------------------------------------- - enable_aws_load_balancer_controller = true + enable_aws_load_balancer_controller = false #--------------------------------------- # Enable FSx for Lustre CSI Driver @@ -189,6 +192,7 @@ resource "kubectl_manifest" "spark_monitor" { depends_on = [module.eks_blueprints_addons] } + #--------------------------------------------------------------- # Data on EKS Kubernetes Addons #--------------------------------------------------------------- @@ -278,9 +282,14 @@ resource "random_password" "grafana" { override_special = "@_" } +resource "random_string" "grafana" { + length = 4 + lower = true +} + #tfsec:ignore:aws-ssm-secret-use-customer-key resource "aws_secretsmanager_secret" "grafana" { - name = "${local.name}-grafana" + name = "${local.name}-grafana-${random_string.grafana.result}" recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy } diff --git a/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml index e3e89c729..d7676f206 100644 --- a/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml @@ -66,9 +66,10 @@ spec: # spark.kubernetes.allocation.batch.size: "20" # default 5 but adjust according to your cluster size # ----------------------------------------------------- volumes: + # This is using the temp storage on 
the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 + emptyDir: {} driver: volumeMounts: - name: spark-local-dir-1 diff --git a/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml index 4b85df7e8..486cae172 100644 --- a/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml @@ -65,9 +65,10 @@ spec: restartPolicy: type: Never volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 + emptyDir: {} driver: volumeMounts: - name: spark-local-dir-1 diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml index e31b03a26..e6f053b99 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml @@ -5,10 +5,10 @@ metadata: namespace: emr-data-team-a spec: volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: provisioner: spark-compute-optimized diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh index 888f7ece2..0debe4038 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh @@ -39,10 +39,10 @@ mkdir -p "../input" wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "../input/yellow_tripdata_2022-0.parquet" # Making duplicate copies to increase the size of the data. 
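The benchmark specs and pod templates above replace the old `hostPath: /local1` scratch volume with an `emptyDir`, so Spark's local directories now live wherever kubelet keeps pod volumes. A minimal sketch of how this could be verified on a provisioned node (for example through an SSM session), assuming the node was bootstrapped with the `LOCAL_DISKS='raid0'` userData introduced later in this patch:

```bash
# kubelet (and therefore every emptyDir volume) should sit on the NVMe RAID0 mount
df -h /mnt/k8s-disks/0
findmnt /var/lib/kubelet

# while a job is running, the Spark scratch dirs appear as emptyDir volumes of the pods
sudo du -sh /var/lib/kubelet/pods/*/volumes/kubernetes.io~empty-dir/spark-local-dir-1 2>/dev/null
```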
-max=100 +max=20 for (( i=1; i <= $max; ++i )) do - cp -rf "../input/yellow_tripdata_2022-0.parquet" "../input/yellow_tripdata_2022-${i}.parquet" +cp -rf "../input/yellow_tripdata_2022-0.parquet" "../input/yellow_tripdata_2022-${i}.parquet" done aws s3 sync "../input" ${INPUT_DATA_S3_PATH} # Sync from local folder to S3 path @@ -64,7 +64,7 @@ aws emr-containers start-job-run \ "entryPointArguments": ["'"$INPUT_DATA_S3_PATH"'", "'"$OUTPUT_DATA_S3_PATH"'" ], - "sparkSubmitParameters": "--conf spark.executor.instances=10" + "sparkSubmitParameters": "--conf spark.executor.instances=2" } }' \ --configuration-overrides '{ diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml index 13dfb5ed2..b3b8a30fb 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml @@ -6,10 +6,10 @@ metadata: spec: volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: provisioner: spark-compute-optimized diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/driver-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/driver-pod-template.yaml index d2641cd37..a175bd2b5 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/driver-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/driver-pod-template.yaml @@ -5,10 +5,10 @@ metadata: namespace: emr-data-team-a spec: volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: NodeGroupType: "SparkGravitonMemoryOptimized" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/executor-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/executor-pod-template.yaml index 2c9c10af2..3c4f606eb 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/executor-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/executor-pod-template.yaml @@ -5,10 +5,10 @@ metadata: namespace: emr-data-team-a spec: volumes: + # This is using the temp storage on the node. 
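Once `execute_emr_eks_job.sh` above has called `start-job-run`, progress can be followed from both the EMR on EKS API and the cluster itself. A rough sketch; the virtual cluster ID, job run ID, and region are placeholders, while `emr-data-team-a` is the namespace used by this blueprint:

```bash
# List recent job runs for the virtual cluster
aws emr-containers list-job-runs \
  --virtual-cluster-id <EMR_VIRTUAL_CLUSTER_ID> \
  --region <AWS_REGION>

# Inspect a single run for its state and failure reason, if any
aws emr-containers describe-job-run \
  --virtual-cluster-id <EMR_VIRTUAL_CLUSTER_ID> \
  --id <JOB_RUN_ID>

# Watch the driver and executor pods come up in the team namespace
kubectl get pods -n emr-data-team-a -w
```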
+ # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: NodeGroupType: "SparkGravitonMemoryOptimized" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/driver-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/driver-pod-template.yaml index 1247783ba..396ef8b5f 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/driver-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/driver-pod-template.yaml @@ -5,10 +5,10 @@ metadata: namespace: emr-data-team-a spec: volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: NodeGroupType: "SparkMemoryOptimized" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/executor-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/executor-pod-template.yaml index e97d76092..f15d8af00 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/executor-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/executor-pod-template.yaml @@ -6,10 +6,10 @@ metadata: spec: volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: NodeGroupType: "SparkMemoryOptimized" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/driver-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/driver-pod-template.yaml index f3677b04f..0bbf8fce0 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/driver-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/driver-pod-template.yaml @@ -40,10 +40,10 @@ metadata: }] spec: volumes: + # This is using the temp storage on the node. 
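The pod templates above steer pods with the `provisioner` and `NodeGroupType` node selectors, which match the labels the rewritten NodePools attach to every node they launch. A quick, hedged way to see which pool satisfied a given pod once capacity is up:

```bash
# Show Karpenter-launched nodes together with the labels the NodePools apply
kubectl get nodes -L provisioner -L NodeGroupType -L karpenter.sh/capacity-type

# Confirm where a specific driver or executor pod landed (pod name is a placeholder)
kubectl get pod <spark-pod-name> -n emr-data-team-a -o jsonpath='{.spec.nodeName}{"\n"}'
```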
+ # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: NodeGroupType: "SparkMemoryOptimized" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/executor-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/executor-pod-template.yaml index 7f2eae1e2..bd322e905 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/executor-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/executor-pod-template.yaml @@ -11,10 +11,10 @@ metadata: spec: volumes: + # This is using the temp storage on the node. + # if you are using NVMe SSD then karpenter will configure the RAID0 under /mnt/k8s-disks/0 and copies the shuffle data to this location - name: spark-local-dir-1 - hostPath: - path: /local1 - type: Directory + emptyDir: {} nodeSelector: NodeGroupType: "SparkMemoryOptimized" diff --git a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml index b3af9c0a4..eb126f996 100644 --- a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml +++ b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml @@ -1,61 +1,89 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool # Previously kind: Provisioner metadata: name: spark-compute-optimized namespace: karpenter # Same namespace as Karpenter add-on installed spec: - kubeletConfiguration: - containerRuntime: containerd - # podsPerCore: 2 - # maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}a] #Update the correct region and zones - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered - operator: In - values: ["c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] + template: + metadata: + labels: + type: karpenter + provisioner: spark-compute-optimized + NodeGroupType: SparkComputeOptimized + spec: + nodeClassRef: + name: spark-compute-optimized + requirements: + - key: "topology.kubernetes.io/zone" + operator: In + values: [${azs}a] #Update the correct region and zones + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "36"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] limits: - resources: - cpu: 2000 - providerRef: - name: spark-compute-optimized - labels: - type: karpenter - provisioner: spark-compute-optimized - ttlSecondsAfterEmpty: 120 # optional, but never scales 
down if not set + cpu: 1000 + disruption: + # Describes which types of Nodes Karpenter should consider for consolidation + # If using 'WhenUnderutilized', Karpenter will consider all nodes for consolidation and attempt to remove or replace Nodes when it discovers that the Node is underutilized and could be changed to reduce cost + # If using `WhenEmpty`, Karpenter will only consider nodes for consolidation that contain no workload pods + consolidationPolicy: WhenEmpty + # The amount of time Karpenter should wait after discovering a consolidation decision + # This value can currently only be set when the consolidationPolicy is 'WhenEmpty' + # You can choose to disable consolidation entirely by setting the string value 'Never' here + consolidateAfter: 30s + # The amount of time a Node can live on the cluster before being removed + # Avoiding long-running Nodes helps to reduce security vulnerabilities as well as to reduce the chance of issues that can plague Nodes with long uptimes such as file fragmentation or memory leaks from system processes + # You can choose to disable expiration entirely by setting the string value 'Never' here + expireAfter: 720h + + # Priority given to the NodePool when the scheduler considers which NodePool + # to select. Higher weights indicate higher priority when comparing NodePools. + # Specifying no weight is equivalent to specifying a weight of 0. + weight: 10 + + +# NOTE: Multiple NodePools may point to the same EC2NodeClass. --- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass # Previously kind: AWSNodeTemplate metadata: name: spark-compute-optimized namespace: karpenter spec: + amiFamily: AL2 blockDeviceMappings: - deviceName: /dev/xvda ebs: - volumeSize: 100Gi + volumeSize: 50Gi volumeType: gp3 encrypted: true deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - # instanceProfile: "" # optional, if already set in controller args - #RAID0 config example + role: "${eks_cluster_id}-karpenter-node" + subnetSelectorTerms: + - tags: # Update the correct region and zones + Name: "${eks_cluster_id}-private*" + securityGroupSelectorTerms: + - name: "${eks_cluster_id}-node*" userData: | MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="BOUNDARY" @@ -63,42 +91,21 @@ spec: --BOUNDARY Content-Type: text/x-shellscript; charset="us-ascii" - #!/bin/bash - echo "Running a custom user data script" - set -ex - yum install mdadm -y + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh - DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') - DISK_ARRAY=() + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF - for DEV in $DEVICES - do - DISK_ARRAY+=("/dev/$${DEV}") - done - - DISK_COUNT=$${#DISK_ARRAY[@]} - - if [ $${DISK_COUNT} -eq 0 ]; then - echo "No SSD disks available. No further action needed." - else - if [ $${DISK_COUNT} -eq 1 ]; then - TARGET_DEV=$${DISK_ARRAY[0]} - mkfs.xfs $${TARGET_DEV} - else - mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} - mkfs.xfs /dev/md0 - TARGET_DEV=/dev/md0 - fi - - mkdir -p /local1 - echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab - mount -a - # NOTE: Update permissions on folder according to your needs and specific user group. This is just an example. - chmod 777 -R /local* - fi + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh --BOUNDARY-- - tags: InstanceType: "spark-compute-optimized" # optional, add tags for your own use diff --git a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml index e73e7e7a0..12fab4a7c 100644 --- a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml +++ b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml @@ -1,66 +1,89 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool # Previously kind: Provisioner metadata: name: spark-graviton-memory-optimized - namespace: karpenter + namespace: karpenter # Same namespace as Karpenter add-on installed spec: - kubeletConfiguration: - containerRuntime: containerd -# podsPerCore: 2 -# maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}b] #Update the correct region and zone - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered - operator: In - values: ["r6gd.4xlarge", "r6gd.8xlarge"] # 2 NVMe disk - - key: "kubernetes.io/arch" - operator: In - values: ["arm64"] + template: + metadata: + labels: + type: karpenter + provisioner: spark-graviton-memory-optimized + NodeGroupType: SparkGravitonMemoryOptimized + spec: + nodeClassRef: + name: spark-graviton-memory-optimized + requirements: + - key: "topology.kubernetes.io/zone" + operator: In + values: [${azs}a] #Update the correct region and zones + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] 
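The provisioner files in this patch move from the older `Provisioner` and `AWSNodeTemplate` kinds to the `v1beta1` `NodePool` and `EC2NodeClass` resources that ship with the Karpenter `v0.33.1` chart pinned above. A sketch of how the migrated objects and the capacity they create could be inspected after `terraform apply`, assuming the new CRDs are installed:

```bash
# The v1beta1 CRDs replace provisioners and awsnodetemplates
kubectl get nodepools,ec2nodeclasses

# Check that the requirements (instance family, cpu, generation) rendered as intended
kubectl describe nodepool spark-graviton-memory-optimized

# NodeClaims are the v1beta1 replacement for Karpenter-managed machines
kubectl get nodeclaims -o wide
```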
limits: - resources: - cpu: 1000 - providerRef: # optional, recommended to use instead of `provider` - name: spark-graviton-memory-optimized - labels: - type: karpenter - provisioner: spark-graviton-memory-optimized - NodeGroupType: SparkGravitonMemoryOptimized - taints: - - key: spark-graviton-memory-optimized - value: 'true' - effect: NoSchedule - ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set + cpu: 1000 + disruption: + # Describes which types of Nodes Karpenter should consider for consolidation + # If using 'WhenUnderutilized', Karpenter will consider all nodes for consolidation and attempt to remove or replace Nodes when it discovers that the Node is underutilized and could be changed to reduce cost + # If using `WhenEmpty`, Karpenter will only consider nodes for consolidation that contain no workload pods + consolidationPolicy: WhenEmpty + # The amount of time Karpenter should wait after discovering a consolidation decision + # This value can currently only be set when the consolidationPolicy is 'WhenEmpty' + # You can choose to disable consolidation entirely by setting the string value 'Never' here + consolidateAfter: 30s + # The amount of time a Node can live on the cluster before being removed + # Avoiding long-running Nodes helps to reduce security vulnerabilities as well as to reduce the chance of issues that can plague Nodes with long uptimes such as file fragmentation or memory leaks from system processes + # You can choose to disable expiration entirely by setting the string value 'Never' here + expireAfter: 720h + + # Priority given to the NodePool when the scheduler considers which NodePool + # to select. Higher weights indicate higher priority when comparing NodePools. + # Specifying no weight is equivalent to specifying a weight of 0. + weight: 10 + + +# NOTE: Multiple NodePools may point to the same EC2NodeClass. --- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass # Previously kind: AWSNodeTemplate metadata: name: spark-graviton-memory-optimized namespace: karpenter spec: + amiFamily: AL2 blockDeviceMappings: - deviceName: /dev/xvda ebs: - volumeSize: 200Gi + volumeSize: 100Gi volumeType: gp3 encrypted: true deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - # instanceProfile: "" # optional, if already set in controller args - #RAID0 config example + role: "${eks_cluster_id}-karpenter-node" + subnetSelectorTerms: + - tags: # Update the correct region and zones + Name: "${eks_cluster_id}-private*" + securityGroupSelectorTerms: + - name: "${eks_cluster_id}-node*" userData: | MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="BOUNDARY" @@ -68,41 +91,21 @@ spec: --BOUNDARY Content-Type: text/x-shellscript; charset="us-ascii" - #!/bin/bash - echo "Running a custom user data script" - set -ex - yum install mdadm -y + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh - DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') - DISK_ARRAY=() + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
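Compared with the old templates, the userData here no longer assembles the array with `mdadm` by hand; it only drops `LOCAL_DISKS='raid0'` into `/etc/profile.d/bootstrap.sh` and lets the stock EKS AMI bootstrap script build and mount the RAID0. A hedged sanity check on a freshly launched node:

```bash
# The drop-in written by the userData above
cat /etc/profile.d/bootstrap.sh

# bootstrap.sh should have assembled the NVMe devices into a software RAID0...
cat /proc/mdstat
lsblk -o NAME,TYPE,SIZE,MOUNTPOINT

# ...and mounted it for kubelet, containerd, and pod logs
findmnt /mnt/k8s-disks/0
```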
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF - for DEV in $DEVICES - do - DISK_ARRAY+=("/dev/$${DEV}") - done - - DISK_COUNT=$${#DISK_ARRAY[@]} - - if [ $${DISK_COUNT} -eq 0 ]; then - echo "No SSD disks available. No further action needed." - else - if [ $${DISK_COUNT} -eq 1 ]; then - TARGET_DEV=$${DISK_ARRAY[0]} - mkfs.xfs $${TARGET_DEV} - else - mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} - mkfs.xfs /dev/md0 - TARGET_DEV=/dev/md0 - fi - - mkdir -p /local1 - echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab - mount -a - /usr/bin/chown -hR +999:+1000 /local1 - fi + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh --BOUNDARY-- - tags: InstanceType: "spark-graviton-memory-optimized" # optional, add tags for your own use diff --git a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-memory-optimized-provisioner.yaml b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-memory-optimized-provisioner.yaml index d99bc8c26..4e876f808 100644 --- a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-memory-optimized-provisioner.yaml +++ b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-memory-optimized-provisioner.yaml @@ -1,66 +1,89 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool # Previously kind: Provisioner metadata: name: spark-memory-optimized - namespace: karpenter + namespace: karpenter # Same namespace as Karpenter add-on installed spec: - kubeletConfiguration: - containerRuntime: containerd -# podsPerCore: 2 -# maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}b] #Update the correct region and zone - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered - operator: In - values: ["r5d.4xlarge","r5d.8xlarge","r5d.12xlarge"] # 2 NVMe disk - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] + template: + metadata: + labels: + type: karpenter + provisioner: spark-memory-optimized + NodeGroupType: SparkMemoryOptimized + spec: + nodeClassRef: + name: spark-memory-optimized + requirements: + - key: "topology.kubernetes.io/zone" + operator: In + values: [${azs}a] #Update the correct region and zones + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] limits: - resources: - cpu: 1000 - providerRef: - name: spark-memory-optimized - labels: - type: karpenter - provisioner: spark-memory-optimized - 
NodeGroupType: SparkMemoryOptimized - taints: - - key: spark-memory-optimized - value: 'true' - effect: NoSchedule - ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set + cpu: 1000 + disruption: + # Describes which types of Nodes Karpenter should consider for consolidation + # If using 'WhenUnderutilized', Karpenter will consider all nodes for consolidation and attempt to remove or replace Nodes when it discovers that the Node is underutilized and could be changed to reduce cost + # If using `WhenEmpty`, Karpenter will only consider nodes for consolidation that contain no workload pods + consolidationPolicy: WhenEmpty + # The amount of time Karpenter should wait after discovering a consolidation decision + # This value can currently only be set when the consolidationPolicy is 'WhenEmpty' + # You can choose to disable consolidation entirely by setting the string value 'Never' here + consolidateAfter: 30s + # The amount of time a Node can live on the cluster before being removed + # Avoiding long-running Nodes helps to reduce security vulnerabilities as well as to reduce the chance of issues that can plague Nodes with long uptimes such as file fragmentation or memory leaks from system processes + # You can choose to disable expiration entirely by setting the string value 'Never' here + expireAfter: 720h + + # Priority given to the NodePool when the scheduler considers which NodePool + # to select. Higher weights indicate higher priority when comparing NodePools. + # Specifying no weight is equivalent to specifying a weight of 0. + weight: 10 + + +# NOTE: Multiple NodePools may point to the same EC2NodeClass. --- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass # Previously kind: AWSNodeTemplate metadata: name: spark-memory-optimized namespace: karpenter spec: + amiFamily: AL2 blockDeviceMappings: - deviceName: /dev/xvda ebs: - volumeSize: 200Gi + volumeSize: 100Gi volumeType: gp3 encrypted: true deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - # instanceProfile: "" # optional, if already set in controller args - #RAID0 config example + role: "${eks_cluster_id}-karpenter-node" + subnetSelectorTerms: + - tags: # Update the correct region and zones + Name: "${eks_cluster_id}-private*" + securityGroupSelectorTerms: + - name: "${eks_cluster_id}-node*" userData: | MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="BOUNDARY" @@ -68,41 +91,21 @@ spec: --BOUNDARY Content-Type: text/x-shellscript; charset="us-ascii" - #!/bin/bash - echo "Running a custom user data script" - set -ex - yum install mdadm -y + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh - DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') - DISK_ARRAY=() + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
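Unlike the previous `Provisioner` definitions, the rewritten memory-optimized and Graviton NodePools no longer add a `NoSchedule` taint, so the Spark pod templates do not need matching tolerations. A quick, hedged check that nodes from these pools come up untainted:

```bash
# Should print node names with an empty taint list for the Spark memory-optimized pool
kubectl get nodes -l NodeGroupType=SparkMemoryOptimized \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints}{"\n"}{end}'
```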
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF - for DEV in $DEVICES - do - DISK_ARRAY+=("/dev/$${DEV}") - done - - DISK_COUNT=$${#DISK_ARRAY[@]} - - if [ $${DISK_COUNT} -eq 0 ]; then - echo "No SSD disks available. No further action needed." - else - if [ $${DISK_COUNT} -eq 1 ]; then - TARGET_DEV=$${DISK_ARRAY[0]} - mkfs.xfs $${TARGET_DEV} - else - mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} - mkfs.xfs /dev/md0 - TARGET_DEV=/dev/md0 - fi - - mkdir -p /local1 - echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab - mount -a - /usr/bin/chown -hR +999:+1000 /local1 - fi + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh --BOUNDARY-- - tags: InstanceType: "spark-memory-optimized" # optional, add tags for your own use diff --git a/analytics/terraform/emr-eks-karpenter/main.tf b/analytics/terraform/emr-eks-karpenter/main.tf index 7dc9afbd4..fce651336 100644 --- a/analytics/terraform/emr-eks-karpenter/main.tf +++ b/analytics/terraform/emr-eks-karpenter/main.tf @@ -69,7 +69,7 @@ locals { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 19.15" + version = "~> 19.18" cluster_name = local.name cluster_version = var.eks_cluster_version diff --git a/analytics/terraform/emr-eks-karpenter/variables.tf b/analytics/terraform/emr-eks-karpenter/variables.tf index 58524f083..1b9ecc560 100644 --- a/analytics/terraform/emr-eks-karpenter/variables.tf +++ b/analytics/terraform/emr-eks-karpenter/variables.tf @@ -11,7 +11,7 @@ variable "region" { variable "eks_cluster_version" { description = "EKS Cluster version" type = string - default = "1.27" + default = "1.28" } variable "tags" { description = "Default tags" diff --git a/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md b/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md index f58cec3e3..04eaf7773 100644 --- a/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md +++ b/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md @@ -115,27 +115,14 @@ First, clone the repository git clone https://github.com/awslabs/data-on-eks.git ``` -Navigate into one of the example directories and run `terraform init` +Navigate into one of the example directories and run `install.sh` script ```bash cd data-on-eks/analytics/terraform/emr-eks-karpenter -terraform init +chmod +x install.sh +./install.sh ``` -Set AWS_REGION and Run Terraform plan to verify the resources created by this execution. - -```bash -export AWS_REGION="us-west-2" -terraform plan -``` - -This command may take between 20 and 30 minutes to create all the resources. - -```bash -terraform apply -``` -Enter `yes` to apply. - ### Verify the resources Verify the Amazon EKS Cluster and Amazon Managed service for Prometheus
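Since the getting-started page now delegates `terraform init`/`plan`/`apply` to `install.sh`, the verification step can be scripted along these lines once the install finishes. Cluster name and region are placeholders for whatever the blueprint variables resolve to in your deployment:

```bash
# Point kubectl at the new cluster
aws eks update-kubeconfig --name <CLUSTER_NAME> --region <AWS_REGION>

# The EKS control plane should be ACTIVE on the default 1.28 version set in variables.tf
aws eks describe-cluster --name <CLUSTER_NAME> \
  --query 'cluster.{status:status,version:version}'

# Managed Prometheus workspace (created when enable_amazon_prometheus = true)
aws amp list-workspaces --query 'workspaces[].{alias:alias,status:status.statusCode}'

# Karpenter and the rest of the add-ons should be running, with the Spark NodePools registered
kubectl get pods -n karpenter
kubectl get nodepools
```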