add spark benchmark test data generation changes
ratnopamc committed Nov 1, 2024
1 parent a7b2d9a commit 8a367fe
Showing 6 changed files with 93 additions and 47 deletions.
4 changes: 2 additions & 2 deletions analytics/terraform/spark-k8s-operator/eks.tf
@@ -175,9 +175,9 @@ module "eks" {
# Node group will be created with zero instances when you deploy the blueprint.
# You can change the min_size and desired_size to 6 instances
# desired_size might not be applied through Terraform once the node group is created, so it needs to be adjusted in the AWS Console.
min_size = 0 # Change min and desired to 6 for running benchmarks
min_size = var.spark_benchmark_ssd_min_size # Change min and desired to 6 for running benchmarks
max_size = 8
desired_size = 0 # Change min and desired to 6 for running benchmarks
desired_size = var.spark_benchmark_ssd_desired_size # Change min and desired to 6 for running benchmarks

instance_types = ["c5d.12xlarge"] # c5d.12xlarge = 2 x 900 NVMe SSD

@@ -1,8 +1,7 @@
# NOTE: This example requires the following prerequisites before executing the jobs
# 1. Ensure spark-team-a name space exists
# 2. replace <S3_BUCKET> with your bucket name
# 2. replace spark-operator-doeks-spark-logs-20241031201012190500000003 with your bucket name

---
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
@@ -21,22 +20,22 @@ spec:
mainClass: com.amazonaws.eks.tpcds.DataGeneration
mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
arguments:
# TPC-DS data location
- "s3a://<S3_BUCKET>/TPCDS-TEST-1TB"
# Path to kit in the docker image
- "/opt/tpcds-kit/tools"
# Data Format
- "parquet"
# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
- "1000"
# Generate data num partitions
- "200"
# Create the partitioned fact tables
- "true"
# Shuffle to get partitions coalesced into single files.
- "true"
# Logging set to WARN
- "true"
# TPC-DS data location
- "s3a://spark-operator-doeks-spark-logs-20241031201012190500000003/TPCDS-TEST-1TB"
# Path to kit in the docker image
- "/opt/tpcds-kit/tools"
# Data Format
- "parquet"
# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
- "1000"
# Generate data num partitions
- "200"
# Create the partitioned fact tables
- "true"
# Shuffle to get partitions coalesced into single files.
- "true"
# Logging set to WARN
- "true"
sparkConf:
"spark.executorEnv.JAVA_HOME": "/opt/java/openjdk"
"spark.driverEnv.JAVA_HOME": "/opt/java/openjdk"
@@ -69,7 +68,7 @@ spec:
"spark.hadoop.fs.s3a.connection.maximum": "200"
"spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"
"spark.kubernetes.executor.podNamePrefix": "oss-data-gen"
"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
# "spark.hadoop.fs.s3a.committer.staging.conflict-mode": "append"
# Data writing and shuffle tuning
"spark.shuffle.file.buffer": "1m"
@@ -98,47 +97,47 @@ spec:
securityContext:
runAsUser: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
- name: spark-local-dir-1
mountPath: /data1
env:
- name: JAVA_HOME
value: "/opt/java/openjdk"
- name: JAVA_HOME
value: "/opt/java/openjdk"
initContainers:
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
nodeSelector:
NodeGroupType: SparkComputeOptimized
NodeGroupType: spark_benchmark_ssd
executor:
cores: 11
# The maximum memory size of the executor container is determined by the sum of
# spark.executor.memoryOverhead, spark.executor.memory, spark.memory.offHeap.size and spark.executor.pyspark.memory
memory: "15g"
memoryOverhead: "4g"
instances: 26
instances: 22
serviceAccount: spark-team-a
securityContext:
runAsUser: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
initContainers:
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
initContainers:
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
env:
- name: JAVA_HOME
value: "/opt/java/openjdk"
- name: JAVA_HOME
value: "/opt/java/openjdk"
nodeSelector:
NodeGroupType: SparkComputeOptimized
NodeGroupType: spark_benchmark_ssd
volumes:
- name: spark-local-dir-1
hostPath:
path: "/mnt/k8s-disks/0"
type: DirectoryOrCreate
- name: spark-local-dir-1
hostPath:
path: "/mnt/k8s-disks/0"
type: DirectoryOrCreate
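The nodeSelector above only lands the driver and executor pods on the c5d.12xlarge node group if that group carries a matching Kubernetes label. The label definition itself is not part of this diff; the sketch below shows how such an entry is typically declared inside the `eks_managed_node_groups` map in `eks.tf` (the node group key and label value are assumptions, chosen here to mirror the selector).

```hcl
# Sketch only: excerpt of the eks_managed_node_groups map in eks.tf.
# The key name and label value are assumptions that mirror the pod nodeSelector.
spark_benchmark_ssd = {
  instance_types = ["c5d.12xlarge"] # 2 x 900 NVMe SSD, per the eks.tf comment
  min_size       = var.spark_benchmark_ssd_min_size
  max_size       = 8
  desired_size   = var.spark_benchmark_ssd_desired_size

  labels = {
    NodeGroupType = "spark_benchmark_ssd" # must match the SparkApplication nodeSelector
  }
}
```

If the node group advertises a different label, both nodeSelector blocks in the manifest above need to be updated to match.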
12 changes: 12 additions & 0 deletions analytics/terraform/spark-k8s-operator/variables.tf
@@ -77,3 +77,15 @@ variable "kms_key_admin_roles" {
type = list(string)
default = []
}

variable "spark_benchmark_ssd_min_size" {
description = "Minimum size for nodegroup of c5d 12xlarge instances to run data genereation for Spark benchmark"

Check failure on line 82 in analytics/terraform/spark-k8s-operator/variables.tf

View workflow job for this annotation

GitHub Actions / Check for spelling errors

genereation ==> generation
type = number
default = 0
}

variable "spark_benchmark_ssd_desired_size" {
description = "Desired size for nodegroup of c5d 12xlarge instances to run data genereation for Spark benchmark"

Check failure on line 88 in analytics/terraform/spark-k8s-operator/variables.tf

View workflow job for this annotation

GitHub Actions / Check for spelling errors

genereation ==> generation
type = number
default = 0
}
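
With these variables in place, the benchmark node group can be scaled up without editing `eks.tf`. A minimal sketch, assuming a hypothetical `spark-benchmark.tfvars` file applied with `terraform apply -var-file=spark-benchmark.tfvars`:

```hcl
# spark-benchmark.tfvars -- hypothetical file name.
# Scale the c5d.12xlarge data-generation node group to 6 nodes, as the
# comments in eks.tf recommend for benchmark runs; set both back to 0 afterwards.
spark_benchmark_ssd_min_size     = 6
spark_benchmark_ssd_desired_size = 6
```

As noted in `eks.tf`, `desired_size` may not be re-applied by Terraform once the node group already exists, so it may still have to be adjusted in the AWS Console.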
@@ -0,0 +1,7 @@
{
"label": "Spark Operator Benchmarks on EKS",
"position": 2,
"link": {
"type": "generated-index"
}
}
@@ -0,0 +1,12 @@
---
sidebar_position: 2
sidebar_label: Data Generation for Spark Operator Benchmark Test
---

# Data Generation for Running Spark Benchmark Tests on Amazon EKS

The following guide walks you through generating the TPC-DS test dataset that serves as input for the Spark benchmark tests on Amazon EKS.
@@ -0,0 +1,16 @@
---
sidebar_position: 1
sidebar_label: Spark Operator Benchmark on EKS
---

# Spark Benchmark Tests on Amazon EKS 🚀

This guide walks you through running Apache Spark benchmark tests on Amazon EKS, AWS's managed Kubernetes service. The benchmarks help you evaluate and optimize Spark workloads on EKS by comparing results across different generations of Graviton-based EC2 instance families, with particular attention to performance, cost efficiency, and reliability at scale.

## Key Features 📈

- Data Generation for the benchmark tests
- Benchmark Test Execution on Different Generations of Graviton Instances (r6g, r7g, r8g)
- Benchmark Results
- Customizable Benchmarks to suit your workloads
- Autoscaling and Cost Optimization Strategies
