diff --git a/schedulers/terraform/argo-workflow/README.md b/schedulers/terraform/argo-workflow/README.md
index b0638ac48..b70782fff 100644
--- a/schedulers/terraform/argo-workflow/README.md
+++ b/schedulers/terraform/argo-workflow/README.md
@@ -19,7 +19,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
|------|---------|
| [aws](#provider\_aws) | >= 3.72 |
| [aws.ecr](#provider\_aws.ecr) | >= 3.72 |
-| [kubectl](#provider\_kubectl) | >= 1.14 |
| [kubernetes](#provider\_kubernetes) | >= 2.10 |
| [random](#provider\_random) | 3.3.2 |
@@ -30,8 +29,8 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
| [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
-| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | 1.9.2 |
-| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.0 |
+| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
+| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.30 |
| [irsa\_argo\_events](#module\_irsa\_argo\_events) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
| [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
| [spark\_team\_a\_irsa](#module\_spark\_team\_a\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
@@ -50,7 +49,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [aws_s3_object.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_object) | resource |
| [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource |
| [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource |
-| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
| [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource |
| [kubernetes_cluster_role.spark_argowf_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource |
| [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource |
@@ -75,7 +73,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
| [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source |
-| [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source |
## Inputs
diff --git a/schedulers/terraform/argo-workflow/addons.tf b/schedulers/terraform/argo-workflow/addons.tf
index 32e7d307f..c07ca7bb2 100644
--- a/schedulers/terraform/argo-workflow/addons.tf
+++ b/schedulers/terraform/argo-workflow/addons.tf
@@ -59,7 +59,7 @@ module "ebs_csi_driver_irsa" {
#---------------------------------------------------------------
module "eks_blueprints_addons" {
source = "aws-ia/eks-blueprints-addons/aws"
- version = "1.9.2"
+ version = "~> 1.2"
cluster_name = module.eks.cluster_name
@@ -100,24 +100,32 @@ module "eks_blueprints_addons" {
description = "Cluster Proportional Autoscaler for CoreDNS Service"
}
- #---------------------------------------
- # Metrics Server
- #---------------------------------------
- enable_metrics_server = true
- metrics_server = {
- values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
- }
-
#---------------------------------------
# Karpenter Autoscaler for EKS Cluster
#---------------------------------------
enable_karpenter = true
karpenter_enable_spot_termination = true
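+  # Allow nodes launched by Karpenter to be managed via SSM Session Manager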
+ karpenter_node = {
+ iam_role_additional_policies = {
+ AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+ }
+ }
karpenter = {
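+    # Karpenter v0.34.x uses the v1beta1 NodePool/EC2NodeClass APIs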
+ chart_version = "v0.34.0"
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}
+ #---------------------------------------
+ # Metrics Server
+ #---------------------------------------
+ enable_metrics_server = true
+ metrics_server = {
+ values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
+ }
+
#---------------------------------------
# AWS for FluentBit - DaemonSet
#---------------------------------------
@@ -199,7 +207,7 @@ module "eks_blueprints_addons" {
#---------------------------------------------------------------
module "eks_data_addons" {
source = "aws-ia/eks-data-addons/aws"
- version = "~> 1.0" # ensure to update this to the latest/desired version
+  version = "~> 1.30" # update this to the latest/desired version as needed
oidc_provider_arn = module.eks.oidc_provider_arn
@@ -234,26 +242,123 @@ module "eks_data_addons" {
]
}
-}
-
-#---------------------------------------
-# Karpenter Provisioners
-#---------------------------------------
-data "kubectl_path_documents" "karpenter_provisioners" {
- pattern = "${path.module}/karpenter-provisioners/spark-*.yaml"
- vars = {
- azs = local.region
- eks_cluster_id = module.eks.cluster_name
+  #---------------------------------------
+  # Karpenter NodePools and EC2NodeClasses
+  #---------------------------------------
+ enable_karpenter_resources = true
+ karpenter_resources_helm_config = {
+ spark-compute-optimized = {
+ values = [
+ <<-EOT
+ name: spark-compute-optimized
+ clusterName: ${module.eks.cluster_name}
+ ec2NodeClass:
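+          # Node IAM role name only; split() strips the ARN prefix from the role created by the Karpenter add-on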
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ subnetSelectorTerms:
+ tags:
+ Name: "${module.eks.cluster_name}-private*"
+ securityGroupSelectorTerms:
+ tags:
+ Name: ${module.eks.cluster_name}-node
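+          # RAID0 stripes the local NVMe instance-store disks into a single volume for Spark scratch space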
+ instanceStorePolicy: RAID0
+ nodePool:
+ labels:
+ - type: karpenter
+ - NodeGroupType: SparkComputeOptimized
+ - multiArch: Spark
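+          # Nitro-based c5d instances (NVMe instance store), 4-36 vCPUs, Spot or On-Demand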
+ requirements:
+ - key: "karpenter.sh/capacity-type"
+ operator: In
+ values: ["spot", "on-demand"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["amd64"]
+ - key: "karpenter.k8s.aws/instance-category"
+ operator: In
+ values: ["c"]
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["c5d"]
+ - key: "karpenter.k8s.aws/instance-cpu"
+ operator: In
+ values: ["4", "8", "16", "36"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+ - key: "karpenter.k8s.aws/instance-generation"
+ operator: Gt
+ values: ["2"]
+ limits:
+ cpu: 1000
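+          # Scale in empty nodes after 30s and recycle all nodes after 30 days (720h)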
+ disruption:
+ consolidationPolicy: WhenEmpty
+ consolidateAfter: 30s
+ expireAfter: 720h
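+          # Higher weight makes Karpenter prefer this pool when multiple NodePools match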
+ weight: 100
+ EOT
+ ]
+ }
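+    # arm64 twin of the pool above, targeting Graviton c7gd instances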
+ spark-graviton-compute-optimized = {
+ values = [
+ <<-EOT
+ name: spark-graviton-compute-optimized
+ clusterName: ${module.eks.cluster_name}
+ ec2NodeClass:
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ subnetSelectorTerms:
+ tags:
+ Name: "${module.eks.cluster_name}-private*"
+ securityGroupSelectorTerms:
+ tags:
+ Name: ${module.eks.cluster_name}-node
+ instanceStorePolicy: RAID0
+ nodePool:
+ labels:
+ - type: karpenter
+ - NodeGroupType: SparkGravitonComputeOptimized
+ - multiArch: Spark
+ requirements:
+ - key: "karpenter.sh/capacity-type"
+ operator: In
+ values: ["spot", "on-demand"]
+ - key: "kubernetes.io/arch"
+ operator: In
+ values: ["arm64"]
+ - key: "karpenter.k8s.aws/instance-category"
+ operator: In
+ values: ["c"]
+ - key: "karpenter.k8s.aws/instance-family"
+ operator: In
+ values: ["c7gd"]
+ - key: "karpenter.k8s.aws/instance-cpu"
+ operator: In
+ values: ["4", "8", "16", "32"]
+ - key: "karpenter.k8s.aws/instance-hypervisor"
+ operator: In
+ values: ["nitro"]
+ - key: "karpenter.k8s.aws/instance-generation"
+ operator: Gt
+ values: ["2"]
+ limits:
+ cpu: 1000
+ disruption:
+ consolidationPolicy: WhenEmpty
+ consolidateAfter: 30s
+ expireAfter: 720h
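+          # Lower weight: the amd64 pool is preferred unless a workload requires arm64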
+ weight: 50
+ EOT
+ ]
+ }
}
}
-resource "kubectl_manifest" "karpenter_provisioner" {
- for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents)
- yaml_body = each.value
-
- depends_on = [module.eks_blueprints_addons]
-}
-
#tfsec:ignore:*
module "s3_bucket" {
source = "terraform-aws-modules/s3-bucket/aws"
diff --git a/schedulers/terraform/argo-workflow/karpenter-provisioners/spark-compute-optimized-provisioner.yaml b/schedulers/terraform/argo-workflow/karpenter-provisioners/spark-compute-optimized-provisioner.yaml
deleted file mode 100644
index b93858da8..000000000
--- a/schedulers/terraform/argo-workflow/karpenter-provisioners/spark-compute-optimized-provisioner.yaml
+++ /dev/null
@@ -1,109 +0,0 @@
-apiVersion: karpenter.sh/v1alpha5
-kind: Provisioner
-metadata:
- name: spark-compute-optimized
- namespace: karpenter # Same namespace as Karpenter add-on installed
-spec:
- kubeletConfiguration:
- containerRuntime: containerd
- # podsPerCore: 2
- # maxPods: 20
- requirements:
- - key: "topology.kubernetes.io/zone"
- operator: In
- values: [${azs}a] #Update the correct region and zones
- - key: "karpenter.sh/capacity-type"
- operator: In
- values: ["spot", "on-demand"]
- - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered
- operator: In
- values: ["c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk
- - key: "kubernetes.io/arch"
- operator: In
- values: ["amd64"]
- limits:
- resources:
- cpu: 2000
- providerRef:
- name: spark-compute-optimized
- labels:
- type: karpenter
- provisioner: spark-compute-optimized
- NodeGroupType: SparkComputeOptimized
- taints:
- - key: spark-compute-optimized
- value: 'true'
- effect: NoSchedule
- ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set
-
----
-apiVersion: karpenter.k8s.aws/v1alpha1
-kind: AWSNodeTemplate
-metadata:
- name: spark-compute-optimized
- namespace: karpenter
-spec:
- blockDeviceMappings:
- - deviceName: /dev/xvda
- ebs:
- volumeSize: 100Gi
- volumeType: gp3
- encrypted: true
- deleteOnTermination: true
- metadataOptions:
- httpEndpoint: enabled
- httpProtocolIPv6: disabled
- httpPutResponseHopLimit: 2
- httpTokens: required
- subnetSelector:
- Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes
- securityGroupSelector: # required, when not using launchTemplate
- Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes
- # instanceProfile: "" # optional, if already set in controller args
- #RAID0 config example
- userData: |
- MIME-Version: 1.0
- Content-Type: multipart/mixed; boundary="BOUNDARY"
-
- --BOUNDARY
- Content-Type: text/x-shellscript; charset="us-ascii"
-
- #!/bin/bash
- echo "Running a custom user data script"
- set -ex
- yum install mdadm -y
-
- DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}')
-
- DISK_ARRAY=()
-
- for DEV in $DEVICES
- do
- DISK_ARRAY+=("/dev/$${DEV}")
- done
-
- DISK_COUNT=$${#DISK_ARRAY[@]}
-
- if [ $${DISK_COUNT} -eq 0 ]; then
- echo "No SSD disks available. No further action needed."
- else
- if [ $${DISK_COUNT} -eq 1 ]; then
- TARGET_DEV=$${DISK_ARRAY[0]}
- mkfs.xfs $${TARGET_DEV}
- else
- mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]}
- mkfs.xfs /dev/md0
- TARGET_DEV=/dev/md0
- fi
-
- mkdir -p /local1
- echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab
- mount -a
- # NOTE: Update permissions on folder according to your needs and specific user group. This is just an example.
- chmod 777 -R /local*
- fi
-
- --BOUNDARY--
-
- tags:
- InstanceType: "spark-compute-optimized" # optional, add tags for your own use
diff --git a/schedulers/terraform/argo-workflow/main.tf b/schedulers/terraform/argo-workflow/main.tf
index 254231538..a77b42bf5 100644
--- a/schedulers/terraform/argo-workflow/main.tf
+++ b/schedulers/terraform/argo-workflow/main.tf
@@ -23,14 +23,6 @@ provider "helm" {
}
}
-provider "kubectl" {
- apply_retry_count = 30
- host = module.eks.cluster_endpoint
- cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
- load_config_file = false
- token = data.aws_eks_cluster_auth.this.token
-}
-
data "aws_eks_cluster_auth" "this" {
name = module.eks.cluster_name
}
diff --git a/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml b/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml
index c1eea5f7a..f80d4e70d 100644
--- a/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml
+++ b/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml
@@ -87,11 +87,6 @@ spec:
sparkVersion: "3.1.1"
restartPolicy:
type: Never
- volumes:
- - name: "test-volume"
- hostPath:
- path: "/tmp"
- type: Directory
driver:
cores: 1
coreLimit: "1200m"
@@ -137,10 +132,6 @@ spec:
},
"tolerations": [{"key": "spark-compute-optimized", "operator": "Exists", "effect": "NoSchedule"}]
}]
- volumeMounts:
- - name: "test-volume"
- mountPath: "/tmp"
- readOnly: false
executor:
cores: 1
instances: 4
@@ -158,10 +149,6 @@ spec:
version: 3.3.1
annotations:
yunikorn.apache.org/task-group-name: "spark-executor"
- volumeMounts:
- - name: "test-volume"
- mountPath: "/tmp"
- readOnly: false
- name: sparkapp-operator-taxi
resource:
action: create
@@ -222,23 +209,7 @@ spec:
onFailureRetryInterval: 10
onSubmissionFailureRetries: 5
onSubmissionFailureRetryInterval: 20
- volumes: # using NVMe instance storage mounted on /mnt/k8s-disks
- - name: spark-local-dir-1
- hostPath:
- path: /mnt/k8s-disks
- type: Directory
driver:
- volumeMounts: # Points to InstanceStore 150GB NVMe SSD for shuffle spill over from memory
- - name: spark-local-dir-1
- mountPath: /data1
- readOnly: false
- initContainers:
- - name: volume-permissions
- image: public.ecr.aws/y4g4v0z7/busybox
- command: [ 'sh', '-c', 'chown -R 185 /mnt/k8s-disks' ]
- volumeMounts:
- - mountPath: "/mnt/k8s-disks"
- name: "spark-local-dir-1"
cores: 1
coreLimit: "1200m"
memory: "4g"
@@ -288,17 +259,6 @@ spec:
executor:
podSecurityContext:
fsGroup: 185
- volumeMounts:
- - name: spark-local-dir-1
- mountPath: /data1
- readOnly: false
- initContainers:
- - name: volume-permissions
- image: public.ecr.aws/y4g4v0z7/busybox
- command: [ 'sh', '-c', 'chown -R 185 /mnt/k8s-disks' ]
- volumeMounts:
- - mountPath: "/mnt/k8s-disks"
- name: "spark-local-dir-1"
cores: 1
coreLimit: "1200m"
instances: 4