From e2baa10b4b2e78ba40cf0e72b6b4ef329d4ca835 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Wed, 30 Oct 2024 10:52:22 -0700 Subject: [PATCH 1/5] update eks module and access entries --- .../terraform/spark-k8s-operator/addons.tf | 9 +++++ analytics/terraform/spark-k8s-operator/eks.tf | 36 +++---------------- .../terraform/spark-k8s-operator/versions.tf | 4 +-- 3 files changed, 16 insertions(+), 33 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf index 00bed48e8..ac8fb485b 100644 --- a/analytics/terraform/spark-k8s-operator/addons.tf +++ b/analytics/terraform/spark-k8s-operator/addons.tf @@ -36,6 +36,15 @@ resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { depends_on = [kubernetes_annotations.gp2_default] } +#--------------------------------------------------------------- +# Karpenter Node instance role Access Entry +#--------------------------------------------------------------- +resource "aws_eks_access_entry" "karpenter_nodes" { + cluster_name = module.eks.cluster_name + principal_arn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + type = "EC2_LINUX" +} + #--------------------------------------------------------------- # Data on EKS Kubernetes Addons #--------------------------------------------------------------- diff --git a/analytics/terraform/spark-k8s-operator/eks.tf b/analytics/terraform/spark-k8s-operator/eks.tf index f2cde7d0d..10ac43f8b 100644 --- a/analytics/terraform/spark-k8s-operator/eks.tf +++ b/analytics/terraform/spark-k8s-operator/eks.tf @@ -3,7 +3,7 @@ #--------------------------------------------------------------- module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 19.15" + version = "~> 20.26" cluster_name = local.name cluster_version = var.eks_cluster_version @@ -11,6 +11,10 @@ module "eks" { #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. 
This feature is designed for sandbox accounts, simplifying cluster deployment and testing. cluster_endpoint_public_access = true + # Add the IAM identity that terraform is using as a cluster admin + authentication_mode = "API_AND_CONFIG_MAP" + enable_cluster_creator_admin_permissions = true + vpc_id = module.vpc.vpc_id # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : @@ -25,19 +29,6 @@ module "eks" { )) - manage_aws_auth_configmap = true - aws_auth_roles = distinct(concat([{ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - }], - var.aws_auth_roles - )) - #--------------------------------------- # Note: This can further restricted to specific required for each Add-on and your application #--------------------------------------- @@ -82,23 +73,6 @@ module "eks" { AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" } - # NVMe instance store volumes are automatically enumerated and assigned a device - pre_bootstrap_user_data = <<-EOT - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
- # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - EOT - ebs_optimized = true # This block device is used only for root volume. Adjust volume according to your size. # NOTE: Don't use this volume for Spark workloads diff --git a/analytics/terraform/spark-k8s-operator/versions.tf b/analytics/terraform/spark-k8s-operator/versions.tf index 5d4c12284..0ee78bb21 100644 --- a/analytics/terraform/spark-k8s-operator/versions.tf +++ b/analytics/terraform/spark-k8s-operator/versions.tf @@ -4,7 +4,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 3.72" + version = ">= 5.34" } kubernetes = { source = "hashicorp/kubernetes" @@ -20,7 +20,7 @@ terraform { } random = { source = "hashicorp/random" - version = "3.3.2" + version = ">=3.6.0" } } From 7de1b22e767f6a47eb5087a9f3d05b9e0321e1b0 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Wed, 30 Oct 2024 11:29:41 -0700 Subject: [PATCH 2/5] pre-commit cleanup --- .../terraform/spark-k8s-operator/README.md | 24 +++++++++---------- analytics/terraform/spark-k8s-operator/eks.tf | 2 +- .../terraform/spark-k8s-operator/variables.tf | 15 ------------ 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/README.md b/analytics/terraform/spark-k8s-operator/README.md index 171a59136..bf4e03370 100644 --- a/analytics/terraform/spark-k8s-operator/README.md +++ b/analytics/terraform/spark-k8s-operator/README.md @@ -7,20 +7,20 @@ Checkout the [documentation 
website](https://awslabs.github.io/data-on-eks/docs/ | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.0.0 | -| [aws](#requirement\_aws) | >= 3.72 | +| [aws](#requirement\_aws) | >= 5.34 | | [helm](#requirement\_helm) | >= 2.4.1 | | [kubectl](#requirement\_kubectl) | >= 1.14 | | [kubernetes](#requirement\_kubernetes) | >= 2.10 | -| [random](#requirement\_random) | 3.3.2 | +| [random](#requirement\_random) | >=3.6.0 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | >= 3.72 | -| [aws.ecr](#provider\_aws.ecr) | >= 3.72 | +| [aws](#provider\_aws) | >= 5.34 | +| [aws.ecr](#provider\_aws.ecr) | >= 5.34 | | [kubernetes](#provider\_kubernetes) | >= 2.10 | -| [random](#provider\_random) | 3.3.2 | +| [random](#provider\_random) | >=3.6.0 | ## Modules @@ -28,7 +28,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ |------|--------|---------| | [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | | [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.34 | -| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 20.26 | | [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 | | [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | 1.34 | | [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | @@ -41,6 +41,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Type | |------|------| +| [aws_eks_access_entry.karpenter_nodes](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_access_entry) | resource | | [aws_iam_policy.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | 
resource | | [aws_iam_policy.spark](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_prometheus_workspace.amp](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_workspace) | resource | @@ -54,7 +55,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [kubernetes_secret_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | | [kubernetes_service_account_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | | [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | -| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/3.3.2/docs/resources/password) | resource | +| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_ecrpublic_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecrpublic_authorization_token) | data source | @@ -70,18 +71,17 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_auth\_roles](#input\_aws\_auth\_roles) | additional aws auth roles |
list(
object(
{
rolearn = string
username = string
groups = list(string
)
}
)
)
| `[]` | no | | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.30"` | no | -| [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
[
"100.64.0.0/17",
"100.64.128.0/17"
]
| no | +| [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
[
"100.64.0.0/17",
"100.64.128.0/17"
]
| no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | | [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `true` | no | | [kms\_key\_admin\_roles](#input\_kms\_key\_admin\_roles) | list of role ARNs to add to the KMS policy | `list(string)` | `[]` | no | | [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"spark-operator-doeks"` | no | -| [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` |
[
"10.1.1.0/24",
"10.1.2.0/24"
]
| no | -| [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` |
[
"10.1.0.0/26",
"10.1.0.64/26"
]
| no | +| [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` |
[
"10.1.1.0/24",
"10.1.2.0/24"
]
| no | +| [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` |
[
"10.1.0.0/26",
"10.1.0.64/26"
]
| no | | [region](#input\_region) | Region | `string` | `"us-west-2"` | no | -| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
[
"100.64.0.0/16"
]
| no | +| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
[
"100.64.0.0/16"
]
| no | | [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/16"` | no | ## Outputs diff --git a/analytics/terraform/spark-k8s-operator/eks.tf b/analytics/terraform/spark-k8s-operator/eks.tf index 10ac43f8b..8f3376bd3 100644 --- a/analytics/terraform/spark-k8s-operator/eks.tf +++ b/analytics/terraform/spark-k8s-operator/eks.tf @@ -12,7 +12,7 @@ module "eks" { cluster_endpoint_public_access = true # Add the IAM identity that terraform is using as a cluster admin - authentication_mode = "API_AND_CONFIG_MAP" + authentication_mode = "API_AND_CONFIG_MAP" enable_cluster_creator_admin_permissions = true vpc_id = module.vpc.vpc_id diff --git a/analytics/terraform/spark-k8s-operator/variables.tf b/analytics/terraform/spark-k8s-operator/variables.tf index 07933a096..afbecc239 100644 --- a/analytics/terraform/spark-k8s-operator/variables.tf +++ b/analytics/terraform/spark-k8s-operator/variables.tf @@ -72,21 +72,6 @@ variable "enable_yunikorn" { type = bool } -variable "aws_auth_roles" { - description = "additional aws auth roles" - type = list( - object( - { - rolearn = string - username = string - groups = list(string - ) - } - ) - ) - default = [] -} - variable "kms_key_admin_roles" { description = "list of role ARNs to add to the KMS policy" type = list(string) From f02aa09a9903076f6cecd73fabb29064e5bcb1c6 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Thu, 31 Oct 2024 07:35:31 -0700 Subject: [PATCH 3/5] bump addon helm charts to latest --- analytics/terraform/spark-k8s-operator/addons.tf | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf index ac8fb485b..58b9372ac 100644 --- a/analytics/terraform/spark-k8s-operator/addons.tf +++ b/analytics/terraform/spark-k8s-operator/addons.tf @@ -405,6 +405,7 @@ module "eks_data_addons" { # Spark history server is required only when EMR Spark 
Operator is enabled enable_spark_history_server = true spark_history_server_helm_config = { + chart_version = "1.2.0" values = [ <<-EOT sparkHistoryOpts: "-Dspark.history.fs.logDirectory=s3a://${module.s3_bucket.s3_bucket_id}/${aws_s3_object.this.key}" @@ -417,6 +418,7 @@ module "eks_data_addons" { #--------------------------------------------------------------- enable_kubecost = true kubecost_helm_config = { + chart_version = "2.3.3" values = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})] repository_username = data.aws_ecrpublic_authorization_token.token.user_name repository_password = data.aws_ecrpublic_authorization_token.token.password @@ -476,6 +478,7 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_metrics_server = true metrics_server = { + chart_version = "3.12.2" values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] } @@ -484,6 +487,7 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_cluster_autoscaler = true cluster_autoscaler = { + chart_version = "9.43.1" values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { aws_region = var.region, eks_cluster_id = module.eks.cluster_name @@ -511,6 +515,7 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_aws_cloudwatch_metrics = true aws_cloudwatch_metrics = { + chart_version = "0.0.11" values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] } @@ -524,6 +529,7 @@ module "eks_blueprints_addons" { retention_in_days = 30 } aws_for_fluentbit = { + chart_version = "0.1.34" s3_bucket_arns = [ module.s3_bucket.s3_bucket_arn, "${module.s3_bucket.s3_bucket_arn}/*" @@ -538,7 +544,7 @@ module "eks_blueprints_addons" { enable_aws_load_balancer_controller = true aws_load_balancer_controller = { - chart_version = "1.5.4" + chart_version = "1.9.2" set = [{ name = "enableServiceMutatorWebhook" value = "false" @@ 
-547,7 +553,7 @@ module "eks_blueprints_addons" { enable_ingress_nginx = true ingress_nginx = { - version = "4.5.2" + chart_version = "4.11.3" values = [templatefile("${path.module}/helm-values/nginx-values.yaml", {})] } @@ -571,7 +577,7 @@ module "eks_blueprints_addons" { amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}" }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {}) ] - chart_version = "48.1.1" + chart_version = "65.5.1" set_sensitive = [ { name = "grafana.adminPassword" From 3b3156dc90163cf82affd4c9b6a11ef60da21040 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Thu, 31 Oct 2024 07:36:32 -0700 Subject: [PATCH 4/5] include kubecost ecr images from upstream values --- .../helm-values/kubecost-values.yaml | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml b/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml index 98c2b8685..a1ec91220 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml @@ -25,8 +25,13 @@ prometheus: create: false kubeStateMetrics: enabled: false - -#imageVersion: prod-1.96.0 # commented to use the latest + server: + image: + repository: public.ecr.aws/kubecost/prometheus + configmapReload: + prometheus: + image: + repository: public.ecr.aws/kubecost/prometheus-config-reloader kubecostFrontend: image: public.ecr.aws/kubecost/frontend @@ -46,6 +51,18 @@ kubecostModel: cpu: "500m" memory: "512Mi" +forecasting: + fullImageName: public.ecr.aws/kubecost/kubecost-modeling:v0.1.18 + +networkCosts: + image: + repository: public.ecr.aws/kubecost/kubecost-network-costs + tag: v0.17.6 + +clusterController: + image: + repository: public.ecr.aws/kubecost/cluster-controller + # Set this to false if you're bringing your own service account. 
#serviceAccount: # create: false @@ -60,3 +77,6 @@ persistentVolume: enabled: true # Note that setting this to false means configurations will be wiped out on pod restart. storageClass: gp3 # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost + +reporting: + productAnalytics: false \ No newline at end of file From e5c4e6fb935843469340541ce456a2343552d804 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Thu, 31 Oct 2024 07:39:59 -0700 Subject: [PATCH 5/5] pre-commit cleanup --- analytics/terraform/spark-k8s-operator/addons.tf | 6 +++--- .../spark-k8s-operator/helm-values/kubecost-values.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf index 58b9372ac..9f4afdfd4 100644 --- a/analytics/terraform/spark-k8s-operator/addons.tf +++ b/analytics/terraform/spark-k8s-operator/addons.tf @@ -418,7 +418,7 @@ module "eks_data_addons" { #--------------------------------------------------------------- enable_kubecost = true kubecost_helm_config = { - chart_version = "2.3.3" + chart_version = "2.3.3" values = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})] repository_username = data.aws_ecrpublic_authorization_token.token.user_name repository_password = data.aws_ecrpublic_authorization_token.token.password @@ -479,7 +479,7 @@ module "eks_blueprints_addons" { enable_metrics_server = true metrics_server = { chart_version = "3.12.2" - values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] + values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] } #--------------------------------------- @@ -516,7 +516,7 @@ module "eks_blueprints_addons" { enable_aws_cloudwatch_metrics = true aws_cloudwatch_metrics = { chart_version = "0.0.11" - values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] + values = 
[templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] } #--------------------------------------- diff --git a/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml b/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml index a1ec91220..97f4a1984 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/kubecost-values.yaml @@ -79,4 +79,4 @@ persistentVolume: # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost reporting: - productAnalytics: false \ No newline at end of file + productAnalytics: false