Commit 9c37889
add 1K, 5K and 10K RayCluster/RayJob scalability tests (ray-project#2218)

Signed-off-by: Andrew Sy Kim <[email protected]>
andrewsykim authored Sep 26, 2024
1 parent f69885b commit 9c37889
Showing 29 changed files with 1,125 additions and 32 deletions.
3 changes: 2 additions & 1 deletion benchmark/perf-tests/100-raycluster/config.yaml
@@ -42,7 +42,8 @@ steps:
       timeout: 30m
       command:
       - "bash"
-      - "100-raycluster/wait-for-rayclusters.sh"
+      - "common/wait-for-rayclusters.sh"
+      - "100"
 - name: Wait for pods to be running
   measurements:
   - Identifier: WaitForControlledPodsRunning
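
Both the 100-scale tests and the new 1000-scale tests now share common/wait-for-rayclusters.sh, which takes the expected number of ready clusters as an argument. The script body is not shown in this diff; below is a minimal sketch of what such a wait loop could look like, assuming the RayCluster CRD reports .status.state and plain kubectl is available (the polling interval and output format are illustrative):

#!/bin/bash
# Hypothetical sketch of common/wait-for-rayclusters.sh; the real script is not in this diff.
# Usage: wait-for-rayclusters.sh <expected-count>
set -euo pipefail

expected="$1"

while true; do
  # Count RayClusters across all namespaces that report state "ready".
  ready=$(kubectl get rayclusters -A \
    -o jsonpath='{range .items[*]}{.status.state}{"\n"}{end}' | grep -c '^ready$' || true)
  echo "$(date) ready RayClusters: ${ready}/${expected}"
  if [ "$ready" -ge "$expected" ]; then
    break
  fi
  sleep 10
done
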
3 changes: 2 additions & 1 deletion benchmark/perf-tests/100-rayjob/config.yaml
@@ -60,7 +60,8 @@ steps:
       timeout: 30m
       command:
       - "bash"
-      - "100-rayjob/wait-for-rayjobs.sh"
+      - "common/wait-for-rayjobs.sh"
+      - "100"
 - name: Wait for pods to be running
   measurements:
   - Identifier: WaitForControlledPodsRunning
28 changes: 0 additions & 28 deletions benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh

This file was deleted.
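
The per-test script was replaced by a shared common/wait-for-rayjobs.sh that takes an expected job count (see the config changes above and below). Its contents are likewise not part of this diff; a plausible sketch, assuming the RayJob CRD reports .status.jobStatus with values such as SUCCEEDED:

#!/bin/bash
# Hypothetical sketch of common/wait-for-rayjobs.sh; the real script is not in this diff.
# Usage: wait-for-rayjobs.sh <expected-count>
set -euo pipefail

expected="$1"

while true; do
  # Count RayJobs across all namespaces that have finished successfully.
  succeeded=$(kubectl get rayjobs -A \
    -o jsonpath='{range .items[*]}{.status.jobStatus}{"\n"}{end}' | grep -c '^SUCCEEDED$' || true)
  echo "$(date) succeeded RayJobs: ${succeeded}/${expected}"
  if [ "$succeeded" -ge "$expected" ]; then
    break
  fi
  sleep 10
done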

68 changes: 68 additions & 0 deletions benchmark/perf-tests/1000-raycluster/config.yaml
@@ -0,0 +1,68 @@
name: kuberay
namespace:
number: 100
tuningSets:
- name: Uniform100qps
qpsLoad:
qps: 100
steps:
- name: Start measurements
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 30m
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: start
apiVersion: ray.io/v1
kind: RayCluster
labelSelector: app.kubernetes.io/created-by = kuberay-operator
operationTimeout: 120s
- name: Preload Images
measurements:
- Identifier: PreloadImages
Method: Exec
Params:
timeout: 30m
command:
- "bash"
- "common/preload-image.sh"
- name: Creating Ray clusters
phases:
- namespaceRange:
min: 1
max: 100
replicasPerNamespace: 10
tuningSet: Uniform100qps
objectBundle:
- basename: raycluster
objectTemplatePath: raycluster.yaml
templateFillMap:
Replicas: 3
Image: "rayproject/ray:2.9.3"
- name: Wait for RayClusters ready
measurements:
- Identifier: WaitForRayCluster
Method: Exec
Params:
timeout: 30m
command:
- "bash"
- "common/wait-for-rayclusters.sh"
- "1000"
- name: Measure wait for pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: gather
- name: Measure pod startup latency
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: gather
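
This config is a ClusterLoader2 test definition from the kubernetes/perf-tests project. A hedged example of how it might be driven; the provider, kubeconfig path, and report directory below are assumptions for illustration, not part of this commit:

# Sketch: running the 1000-raycluster test with a clusterloader2 binary
# built from kubernetes/perf-tests; all flag values are illustrative.
./clusterloader2 \
  --testconfig=benchmark/perf-tests/1000-raycluster/config.yaml \
  --provider=gke \
  --kubeconfig="$HOME/.kube/config" \
  --report-dir=benchmark/perf-tests/1000-raycluster/results
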
50 changes: 50 additions & 0 deletions benchmark/perf-tests/1000-raycluster/raycluster.yaml
@@ -0,0 +1,50 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
name: {{.Name}}
labels:
perf-test: ray-cluster
spec:
rayVersion: '2.9.3'
headGroupSpec:
serviceType: ClusterIP
rayStartParams:
dashboard-host: '0.0.0.0'
disable-usage-stats: 'true'
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
cpu: "1"
requests:
cpu: "10m"
volumes:
- name: ray-logs
emptyDir: {}
workerGroupSpecs:
- replicas: {{.Replicas}}
minReplicas: 1
maxReplicas: 10
# logical group name; called small-group here, but it can be any descriptive name
groupName: small-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
limits:
cpu: "1"
requests:
cpu: "10m"
10 changes: 10 additions & 0 deletions benchmark/perf-tests/1000-raycluster/results/junit.xml
@@ -0,0 +1,10 @@
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="1644.719">
<testcase name="kuberay overall (1000-raycluster/config.yaml)" classname="ClusterLoaderV2" time="647.5399098"/>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.105058303"/>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="1.006024017"/>
<testcase name="kuberay: [step: 02] Preload Images [00] - PreloadImages" classname="ClusterLoaderV2" time="309.500836837"/>
<testcase name="kuberay: [step: 03] Creating Ray clusters" classname="ClusterLoaderV2" time="10.622250764"/>
<testcase name="kuberay: [step: 04] Wait for RayClusters ready [00] - WaitForRayCluster" classname="ClusterLoaderV2" time="258.283033377"/>
<testcase name="kuberay: [step: 05] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.491021323"/>
<testcase name="kuberay: [step: 06] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="1.513658548"/>
</testsuite>
83 changes: 83 additions & 0 deletions benchmark/perf-tests/1000-rayjob/config.yaml
@@ -0,0 +1,83 @@
name: kuberay
namespace:
number: 100
tuningSets:
- name: Uniform100qps
qpsLoad:
qps: 100
steps:
- name: Start measurements
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 30m
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: start
apiVersion: ray.io/v1
kind: RayCluster
labelSelector: app.kubernetes.io/created-by = kuberay-operator
operationTimeout: 120s
- Identifier: JobLifecycleLatency
Method: JobLifecycleLatency
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 10m
- name: Creating RayJobs for PyTorch MNIST fine-tuning
phases:
- namespaceRange:
min: 1
max: 100
replicasPerNamespace: 5
tuningSet: Uniform100qps
objectBundle:
- basename: pytorch-mnist
objectTemplatePath: pytorch-mnist-rayjob.yaml
templateFillMap:
Image: "rayproject/ray:2.9.3"
- name: Creating RayJobs for Ray Data Image Resizing
phases:
- namespaceRange:
min: 1
max: 100
replicasPerNamespace: 5
tuningSet: Uniform100qps
objectBundle:
- basename: ray-data-image-resize
objectTemplatePath: ray-data-image-resize.yaml
templateFillMap:
Image: "rayproject/ray:2.9.3"
- name: Wait for RayJobs complete
measurements:
- Identifier: WaitForRayJob
Method: Exec
Params:
timeout: 30m
command:
- "bash"
- "common/wait-for-rayjobs.sh"
- "500" # 1000 since we deploy two RayJobs with 500 instances each
- name: Measure wait for pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
action: gather
operationTimeout: 10m
- name: Measure pod startup latency
measurements:
- Identifier: PodStartupLatency
Method: PodStartupLatency
Params:
action: gather
- name: Measure job finished
measurements:
- Identifier: JobLifecycleLatency
Method: JobLifecycleLatency
Params:
action: gather
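
While this test runs, job progress across all 100 namespaces can be summarized with a one-liner like the following (the jsonpath follows the RayJob CRD's .status.jobStatus field; the tally format is illustrative):

# Tally RayJobs by status (e.g. PENDING / RUNNING / SUCCEEDED / FAILED).
kubectl get rayjobs -A \
  -o jsonpath='{range .items[*]}{.status.jobStatus}{"\n"}{end}' | sort | uniq -c
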
63 changes: 63 additions & 0 deletions benchmark/perf-tests/1000-rayjob/pytorch-mnist-rayjob.yaml
@@ -0,0 +1,63 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
name: {{.Name}}
labels:
perf-test: rayjob-pytorch-mnist
spec:
shutdownAfterJobFinishes: true
entrypoint: python ray_train_pytorch_mnist.py
submitterPodTemplate:
spec:
restartPolicy: Never
containers:
- name: submitter-job
image: {{.Image}}
command:
- "sh"
- "-c"
args:
- |
#!/bin/sh
ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
resources:
requests:
cpu: "10m"
rayClusterSpec:
rayVersion: '2.9.3'
headGroupSpec:
rayStartParams:
disable-usage-stats: 'true'
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
requests:
cpu: "100m"
memory: "4Gi"
workerGroupSpecs:
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: worker-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
requests:
cpu: "100m"
memory: "4Gi"
63 changes: 63 additions & 0 deletions benchmark/perf-tests/1000-rayjob/ray-data-image-resize.yaml
@@ -0,0 +1,63 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
name: {{.Name}}
labels:
perf-test: ray-data-image-resize
spec:
shutdownAfterJobFinishes: true
entrypoint: python ray_data_image_resize.py
submitterPodTemplate:
spec:
restartPolicy: Never
containers:
- name: submitter-job
image: {{.Image}}
command:
- "sh"
- "-c"
args:
- |
#!/bin/sh
ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
resources:
requests:
cpu: "10m"
rayClusterSpec:
rayVersion: '2.9.3'
headGroupSpec:
rayStartParams:
disable-usage-stats: 'true'
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
requests:
cpu: "100m"
memory: "2Gi"
workerGroupSpecs:
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: worker-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
requests:
cpu: "100m"
memory: "2Gi"