Commit 9c37889
add 1K, 5K and 10K RayCluster/RayJob scalability tests (ray-project#2218)

Signed-off-by: Andrew Sy Kim <[email protected]>
andrewsykim authored Sep 26, 2024
1 parent f69885b commit 9c37889
Showing 29 changed files with 1,125 additions and 32 deletions.
3 changes: 2 additions & 1 deletion benchmark/perf-tests/100-raycluster/config.yaml
@@ -42,7 +42,8 @@ steps:
       timeout: 30m
       command:
       - "bash"
-      - "100-raycluster/wait-for-rayclusters.sh"
+      - "common/wait-for-rayclusters.sh"
+      - "100"
 - name: Wait for pods to be running
   measurements:
   - Identifier: WaitForControlledPodsRunning
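
Both the 100-scale tests and the new 1000-scale tests now share common/wait-for-rayclusters.sh, which takes the expected number of ready clusters as an argument. The script body is not shown in this diff; below is a minimal sketch of what such a wait loop could look like, assuming the RayCluster CRD reports .status.state and plain kubectl is available (the polling interval and output format are illustrative):

#!/bin/bash
# Hypothetical sketch of common/wait-for-rayclusters.sh; the real script is not in this diff.
# Usage: wait-for-rayclusters.sh <expected-count>
set -euo pipefail

expected="$1"

while true; do
  # Count RayClusters across all namespaces that report state "ready".
  ready=$(kubectl get rayclusters -A \
    -o jsonpath='{range .items[*]}{.status.state}{"\n"}{end}' | grep -c '^ready$' || true)
  echo "$(date) ready RayClusters: ${ready}/${expected}"
  if [ "$ready" -ge "$expected" ]; then
    break
  fi
  sleep 10
done
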
3 changes: 2 additions & 1 deletion benchmark/perf-tests/100-rayjob/config.yaml
@@ -60,7 +60,8 @@ steps:
       timeout: 30m
       command:
       - "bash"
-      - "100-rayjob/wait-for-rayjobs.sh"
+      - "common/wait-for-rayjobs.sh"
+      - "100"
 - name: Wait for pods to be running
   measurements:
   - Identifier: WaitForControlledPodsRunning
28 changes: 0 additions & 28 deletions benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh

This file was deleted.
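
The per-test script was replaced by a shared common/wait-for-rayjobs.sh that takes an expected job count (see the config changes above and below). Its contents are likewise not part of this diff; a plausible sketch, assuming the RayJob CRD reports .status.jobStatus with values such as SUCCEEDED:

#!/bin/bash
# Hypothetical sketch of common/wait-for-rayjobs.sh; the real script is not in this diff.
# Usage: wait-for-rayjobs.sh <expected-count>
set -euo pipefail

expected="$1"

while true; do
  # Count RayJobs across all namespaces that have finished successfully.
  succeeded=$(kubectl get rayjobs -A \
    -o jsonpath='{range .items[*]}{.status.jobStatus}{"\n"}{end}' | grep -c '^SUCCEEDED$' || true)
  echo "$(date) succeeded RayJobs: ${succeeded}/${expected}"
  if [ "$succeeded" -ge "$expected" ]; then
    break
  fi
  sleep 10
done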

68 changes: 68 additions & 0 deletions benchmark/perf-tests/1000-raycluster/config.yaml
@@ -0,0 +1,68 @@
name: kuberay
namespace:
number: 100
tuningSets:
- name: Uniform100qps
qpsLoad:
qps: 100
steps:
- name: Start measurements
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 30m
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: start
apiVersion: ray.io/v1
kind: RayCluster
labelSelector: app.kubernetes.io/created-by = kuberay-operator
operationTimeout: 120s
- name: Preload Images
measurements:
- Identifier: PreloadImages
Method: Exec
Params:
timeout: 30m
command:
- "bash"
- "common/preload-image.sh"
- name: Creating Ray clusters
phases:
- namespaceRange:
min: 1
max: 100
replicasPerNamespace: 10
tuningSet: Uniform100qps
objectBundle:
- basename: raycluster
objectTemplatePath: raycluster.yaml
templateFillMap:
Replicas: 3
Image: "rayproject/ray:2.9.3"
- name: Wait for RayClusters ready
measurements:
- Identifier: WaitForRayCluster
Method: Exec
Params:
timeout: 30m
command:
- "bash"
- "common/wait-for-rayclusters.sh"
- "1000"
- name: Measure wait for pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: gather
- name: Measure pod startup latency
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: gather
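
This config is a ClusterLoader2 test definition from the kubernetes/perf-tests project. A hedged example of how it might be driven; the provider, kubeconfig path, and report directory below are assumptions for illustration, not part of this commit:

# Sketch: running the 1000-raycluster test with a clusterloader2 binary
# built from kubernetes/perf-tests; all flag values are illustrative.
./clusterloader2 \
  --testconfig=benchmark/perf-tests/1000-raycluster/config.yaml \
  --provider=gke \
  --kubeconfig="$HOME/.kube/config" \
  --report-dir=benchmark/perf-tests/1000-raycluster/results
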
50 changes: 50 additions & 0 deletions benchmark/perf-tests/1000-raycluster/raycluster.yaml
@@ -0,0 +1,50 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
name: {{.Name}}
labels:
perf-test: ray-cluster
spec:
rayVersion: '2.9.3'
headGroupSpec:
serviceType: ClusterIP
rayStartParams:
dashboard-host: '0.0.0.0'
disable-usage-stats: 'true'
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
cpu: "1"
requests:
cpu: "10m"
volumes:
- name: ray-logs
emptyDir: {}
workerGroupSpecs:
- replicas: {{.Replicas}}
minReplicas: 1
maxReplicas: 10
# logical group name; called small-group here, but it can be any descriptive name
groupName: small-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
limits:
cpu: "1"
requests:
cpu: "10m"
10 changes: 10 additions & 0 deletions benchmark/perf-tests/1000-raycluster/results/junit.xml
@@ -0,0 +1,10 @@
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="1644.719">
<testcase name="kuberay overall (1000-raycluster/config.yaml)" classname="ClusterLoaderV2" time="647.5399098"/>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.105058303"/>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="1.006024017"/>
<testcase name="kuberay: [step: 02] Preload Images [00] - PreloadImages" classname="ClusterLoaderV2" time="309.500836837"/>
<testcase name="kuberay: [step: 03] Creating Ray clusters" classname="ClusterLoaderV2" time="10.622250764"/>
<testcase name="kuberay: [step: 04] Wait for RayClusters ready [00] - WaitForRayCluster" classname="ClusterLoaderV2" time="258.283033377"/>
<testcase name="kuberay: [step: 05] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.491021323"/>
<testcase name="kuberay: [step: 06] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="1.513658548"/>
</testsuite>
83 changes: 83 additions & 0 deletions benchmark/perf-tests/1000-rayjob/config.yaml
@@ -0,0 +1,83 @@
name: kuberay
namespace:
number: 100
tuningSets:
- name: Uniform100qps
qpsLoad:
qps: 100
steps:
- name: Start measurements
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 30m
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: start
apiVersion: ray.io/v1
kind: RayCluster
labelSelector: app.kubernetes.io/created-by = kuberay-operator
operationTimeout: 120s
- Identifier: JobLifecycleLatency
Method: JobLifecycleLatency
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 10m
- name: Creating RayJobs for PyTorch MNIST fine-tuning
phases:
- namespaceRange:
min: 1
max: 100
replicasPerNamespace: 5
tuningSet: Uniform100qps
objectBundle:
- basename: pytorch-mnist
objectTemplatePath: pytorch-mnist-rayjob.yaml
templateFillMap:
Image: "rayproject/ray:2.9.3"
- name: Creating RayJobs for Ray Data Image Resizing
phases:
- namespaceRange:
min: 1
max: 100
replicasPerNamespace: 5
tuningSet: Uniform100qps
objectBundle:
- basename: ray-data-image-resize
objectTemplatePath: ray-data-image-resize.yaml
templateFillMap:
Image: "rayproject/ray:2.9.3"
- name: Wait for RayJobs complete
measurements:
- Identifier: WaitForRayJob
Method: Exec
Params:
timeout: 30m
command:
- "bash"
- "common/wait-for-rayjobs.sh"
- "500" # 1000 since we deploy two RayJobs with 500 instances each
- name: Measure wait for pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: gather
operationTimeout: 10m
- name: Measure pod startup latency
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: gather
- name: Measure job finished
measurements:
- Identifier: JobLifecycleLatency
Method: JobLifecycleLatency
Params:
action: gather
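
While this test runs, job progress across all 100 namespaces can be summarized with a one-liner like the following (the jsonpath follows the RayJob CRD's .status.jobStatus field; the tally format is illustrative):

# Tally RayJobs by status (e.g. PENDING / RUNNING / SUCCEEDED / FAILED).
kubectl get rayjobs -A \
  -o jsonpath='{range .items[*]}{.status.jobStatus}{"\n"}{end}' | sort | uniq -c
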
63 changes: 63 additions & 0 deletions benchmark/perf-tests/1000-rayjob/pytorch-mnist-rayjob.yaml
@@ -0,0 +1,63 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
name: {{.Name}}
labels:
perf-test: rayjob-pytorch-mnist
spec:
shutdownAfterJobFinishes: true
entrypoint: python ray_train_pytorch_mnist.py
submitterPodTemplate:
spec:
restartPolicy: Never
containers:
- name: submitter-job
image: {{.Image}}
command:
- "sh"
- "-c"
args:
- |
#!/bin/sh
ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
resources:
requests:
cpu: "10m"
rayClusterSpec:
rayVersion: '2.9.3'
headGroupSpec:
rayStartParams:
disable-usage-stats: 'true'
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
requests:
cpu: "100m"
memory: "4Gi"
workerGroupSpecs:
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: worker-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
requests:
cpu: "100m"
memory: "4Gi"
63 changes: 63 additions & 0 deletions benchmark/perf-tests/1000-rayjob/ray-data-image-resize.yaml
@@ -0,0 +1,63 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
name: {{.Name}}
labels:
perf-test: ray-data-image-resize
spec:
shutdownAfterJobFinishes: true
entrypoint: python ray_data_image_resize.py
submitterPodTemplate:
spec:
restartPolicy: Never
containers:
- name: submitter-job
image: {{.Image}}
command:
- "sh"
- "-c"
args:
- |
#!/bin/sh
ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
resources:
requests:
cpu: "10m"
rayClusterSpec:
rayVersion: '2.9.3'
headGroupSpec:
rayStartParams:
disable-usage-stats: 'true'
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
requests:
cpu: "100m"
memory: "2Gi"
workerGroupSpecs:
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: worker-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
requests:
cpu: "100m"
memory: "2Gi"