add spark benchmark test data generation changes
ratnopamc committed Nov 1, 2024
1 parent a7b2d9a commit 8a367fe
Showing 6 changed files with 93 additions and 47 deletions.
4 changes: 2 additions & 2 deletions analytics/terraform/spark-k8s-operator/eks.tf
@@ -175,9 +175,9 @@ module "eks" {
# Node group will be created with zero instances when you deploy the blueprint.
# You can change the min_size and desired_size to 6 instances
# desired_size might not be applied through Terraform once the node group is created, so it needs to be adjusted in the AWS Console.
min_size = 0 # Change min and desired to 6 for running benchmarks
min_size = var.spark_benchmark_ssd_min_size # Change min and desired to 6 for running benchmarks
max_size = 8
desired_size = 0 # Change min and desired to 6 for running benchmarks
desired_size = var.spark_benchmark_ssd_desired_size # Change min and desired to 6 for running benchmarks

instance_types = ["c5d.12xlarge"] # c5d.12xlarge = 2 x 900 NVMe SSD

@@ -1,8 +1,7 @@
# NOTE: This example requires the following prerequisites before executing the jobs
# 1. Ensure spark-team-a name space exists
# 2. replace <S3_BUCKET> with your bucket name
# 2. replace spark-operator-doeks-spark-logs-20241031201012190500000003 with your bucket name

---
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
@@ -21,22 +20,22 @@ spec:
mainClass: com.amazonaws.eks.tpcds.DataGeneration
mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
arguments:
# TPC-DS data location
- "s3a://<S3_BUCKET>/TPCDS-TEST-1TB"
# Path to kit in the docker image
- "/opt/tpcds-kit/tools"
# Data Format
- "parquet"
# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
- "1000"
# Generate data num partitions
- "200"
# Create the partitioned fact tables
- "true"
# Shuffle to get partitions coalesced into single files.
- "true"
# Logging set to WARN
- "true"
# TPC-DS data location
- "s3a://spark-operator-doeks-spark-logs-20241031201012190500000003/TPCDS-TEST-1TB"
# Path to kit in the docker image
- "/opt/tpcds-kit/tools"
# Data Format
- "parquet"
# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
- "1000"
# Generate data num partitions
- "200"
# Create the partitioned fact tables
- "true"
# Shuffle to get partitions coalesced into single files.
- "true"
# Logging set to WARN
- "true"
sparkConf:
"spark.executorEnv.JAVA_HOME": "/opt/java/openjdk"
"spark.driverEnv.JAVA_HOME": "/opt/java/openjdk"
@@ -69,7 +68,7 @@ spec:
"spark.hadoop.fs.s3a.connection.maximum": "200"
"spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"
"spark.kubernetes.executor.podNamePrefix": "oss-data-gen"
"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
# "spark.hadoop.fs.s3a.committer.staging.conflict-mode": "append"
# Data writing and shuffle tuning
"spark.shuffle.file.buffer": "1m"
@@ -98,47 +97,47 @@ spec:
securityContext:
runAsUser: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
- name: spark-local-dir-1
mountPath: /data1
env:
- name: JAVA_HOME
value: "/opt/java/openjdk"
- name: JAVA_HOME
value: "/opt/java/openjdk"
initContainers:
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
nodeSelector:
NodeGroupType: SparkComputeOptimized
NodeGroupType: spark_benchmark_ssd
executor:
cores: 11
# The maximum memory size of the executor container is determined by the sum of
# spark.executor.memoryOverhead, spark.executor.memory, spark.memory.offHeap.size and spark.executor.pyspark.memory
memory: "15g"
memoryOverhead: "4g"
instances: 26
instances: 22
serviceAccount: spark-team-a
securityContext:
runAsUser: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
initContainers:
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
initContainers:
- name: volume-permission
image: public.ecr.aws/docker/library/busybox
command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
env:
- name: JAVA_HOME
value: "/opt/java/openjdk"
- name: JAVA_HOME
value: "/opt/java/openjdk"
nodeSelector:
NodeGroupType: SparkComputeOptimized
NodeGroupType: spark_benchmark_ssd
volumes:
- name: spark-local-dir-1
hostPath:
path: "/mnt/k8s-disks/0"
type: DirectoryOrCreate
- name: spark-local-dir-1
hostPath:
path: "/mnt/k8s-disks/0"
type: DirectoryOrCreate
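The nodeSelector above only lands the driver and executor pods on the c5d.12xlarge node group if that group carries a matching Kubernetes label. The label definition itself is not part of this diff; the sketch below shows how such an entry is typically declared inside the `eks_managed_node_groups` map in `eks.tf` (the node group key and label value are assumptions, chosen here to mirror the selector).

```hcl
# Sketch only: excerpt of the eks_managed_node_groups map in eks.tf.
# The key name and label value are assumptions that mirror the pod nodeSelector.
spark_benchmark_ssd = {
  instance_types = ["c5d.12xlarge"] # 2 x 900 NVMe SSD, per the eks.tf comment
  min_size       = var.spark_benchmark_ssd_min_size
  max_size       = 8
  desired_size   = var.spark_benchmark_ssd_desired_size

  labels = {
    NodeGroupType = "spark_benchmark_ssd" # must match the SparkApplication nodeSelector
  }
}
```

If the node group advertises a different label, both nodeSelector blocks in the manifest above need to be updated to match.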
12 changes: 12 additions & 0 deletions analytics/terraform/spark-k8s-operator/variables.tf
@@ -77,3 +77,15 @@ variable "kms_key_admin_roles" {
type = list(string)
default = []
}

variable "spark_benchmark_ssd_min_size" {
description = "Minimum size for nodegroup of c5d 12xlarge instances to run data genereation for Spark benchmark"

Check failure on line 82 in analytics/terraform/spark-k8s-operator/variables.tf

View workflow job for this annotation

GitHub Actions / Check for spelling errors

genereation ==> generation
type = number
default = 0
}

variable "spark_benchmark_ssd_desired_size" {
description = "Desired size for nodegroup of c5d 12xlarge instances to run data genereation for Spark benchmark"

Check failure on line 88 in analytics/terraform/spark-k8s-operator/variables.tf

View workflow job for this annotation

GitHub Actions / Check for spelling errors

genereation ==> generation
type = number
default = 0
}
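
With these variables in place, the benchmark node group can be scaled up without editing `eks.tf`. A minimal sketch, assuming a hypothetical `spark-benchmark.tfvars` file applied with `terraform apply -var-file=spark-benchmark.tfvars`:

```hcl
# spark-benchmark.tfvars -- hypothetical file name.
# Scale the c5d.12xlarge data-generation node group to 6 nodes, as the
# comments in eks.tf recommend for benchmark runs; set both back to 0 afterwards.
spark_benchmark_ssd_min_size     = 6
spark_benchmark_ssd_desired_size = 6
```

As noted in `eks.tf`, `desired_size` may not be re-applied by Terraform once the node group already exists, so it may still have to be adjusted in the AWS Console.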
@@ -0,0 +1,7 @@
{
"label": "Spark Operator Benchmarks on EKS",
"position": 2,
"link": {
"type": "generated-index"
}
}
@@ -0,0 +1,12 @@
---
sidebar_position: 2
sidebar_label: Data Generation for Spark Operator Benchmark Test
---

# Data Generation for Running Spark Benchmark Tests on Amazon EKS

The following guide walks you through generating the TPC-DS test dataset that serves as input for the Spark benchmark tests on Amazon EKS.
@@ -0,0 +1,16 @@
---
sidebar_position: 1
sidebar_label: Spark Operator Benchmark on EKS
---

# Spark Benchmark Tests on Amazon EKS 🚀

This guide walks you through running Apache Spark benchmark tests on Amazon EKS, AWS's managed Kubernetes service. The benchmarks help you evaluate and optimize Spark workloads on EKS by comparing results across different generations of Graviton-based EC2 instance families, with particular attention to performance, cost efficiency, and reliability at scale.

## Key Features 📈

- Data Generation for the benchmark tests
- Benchmark Test Execution on Different Generations of Graviton Instances (r6g, r7g, r8g)
- Benchmark Results
- Customizable Benchmarks to suit your workloads
- Autoscaling and Cost Optimization Strategies
