forked from ray-project/kuberay
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add 1K, 5K and 10K RayCluster/RayJob scalability tests (ray-project#2218)
Signed-off-by: Andrew Sy Kim <[email protected]>
- Loading branch information
1 parent
f69885b
commit 9c37889
Showing
29 changed files
with
1,125 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# ClusterLoader2 scenario "kuberay": creates 1000 RayCluster objects
# (100 namespaces x 10 replicas per namespace) and gathers pod measurements.
name: kuberay
namespace:
  number: 100
tuningSets:
  - name: Uniform100qps
    qpsLoad:
      qps: 100
steps:
  # Both measurements are started before any object is created and are
  # gathered in the last two steps.
  - name: Start measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: start
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          threshold: 30m
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: start
          apiVersion: ray.io/v1
          kind: RayCluster
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          operationTimeout: 120s
  # Runs an external script; the path is presumably resolved relative to the
  # perf-tests directory (NOTE(review): confirm the working directory).
  - name: Preload Images
    measurements:
      - Identifier: PreloadImages
        Method: Exec
        Params:
          timeout: 30m
          command:
            - "bash"
            - "common/preload-image.sh"
  - name: Creating Ray clusters
    phases:
      - namespaceRange:
          min: 1
          max: 100
        replicasPerNamespace: 10
        tuningSet: Uniform100qps
        objectBundle:
          - basename: raycluster
            objectTemplatePath: raycluster.yaml
            # Values substituted for {{.Replicas}} and {{.Image}} in the template.
            templateFillMap:
              Replicas: 3
              Image: "rayproject/ray:2.9.3"
  - name: Wait for RayClusters ready
    measurements:
      - Identifier: WaitForRayCluster
        Method: Exec
        Params:
          timeout: 30m
          command:
            - "bash"
            - "common/wait-for-rayclusters.sh"
            # 1000 = 100 namespaces x 10 RayClusters per namespace.
            - "1000"
  - name: Measure wait for pods to be running
    measurements:
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: gather
  - name: Measure pod startup latency
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: gather
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# RayCluster object template. {{.Name}}, {{.Image}} and {{.Replicas}} are
# template placeholders that must be filled in before the document is parsed
# as YAML.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: {{.Name}}
  labels:
    perf-test: ray-cluster
spec:
  rayVersion: '2.9.3'
  headGroupSpec:
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      disable-usage-stats: 'true'
    template:
      spec:
        containers:
          - name: ray-head
            image: {{.Image}}
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
            resources:
              limits:
                cpu: "1"
              requests:
                cpu: "10m"
        # NOTE(review): no container in this template mounts ray-logs; the
        # volume is kept as-is — confirm whether it is still needed.
        volumes:
          - name: ray-logs
            emptyDir: {}
  workerGroupSpecs:
    - replicas: {{.Replicas}}
      minReplicas: 1
      maxReplicas: 10
      # Logical name of this worker group ("small-group" here); a functional
      # name works as well.
      groupName: small-group
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-worker
              image: {{.Image}}
              resources:
                limits:
                  cpu: "1"
                requests:
                  cpu: "10m"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
<!-- JUnit-style results of the ClusterLoaderV2 suite for 1000-raycluster/config.yaml.
     All values are kept exactly as recorded.
     NOTE(review): tests="0" although eight testcase entries follow. -->
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="1644.719">
  <testcase name="kuberay overall (1000-raycluster/config.yaml)" classname="ClusterLoaderV2" time="647.5399098"/>
  <testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.105058303"/>
  <testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="1.006024017"/>
  <testcase name="kuberay: [step: 02] Preload Images [00] - PreloadImages" classname="ClusterLoaderV2" time="309.500836837"/>
  <testcase name="kuberay: [step: 03] Creating Ray clusters" classname="ClusterLoaderV2" time="10.622250764"/>
  <testcase name="kuberay: [step: 04] Wait for RayClusters ready [00] - WaitForRayCluster" classname="ClusterLoaderV2" time="258.283033377"/>
  <testcase name="kuberay: [step: 05] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.491021323"/>
  <testcase name="kuberay: [step: 06] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="1.513658548"/>
</testsuite>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# ClusterLoader2 scenario "kuberay": creates RayJobs from two templates
# (100 namespaces x 5 replicas per namespace for each template, 1000 RayJobs
# in total) and gathers pod and job measurements.
name: kuberay
namespace:
  number: 100
tuningSets:
  - name: Uniform100qps
    qpsLoad:
      qps: 100
steps:
  # All three measurements are started before any object is created and are
  # gathered in the last three steps.
  - name: Start measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: start
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          threshold: 30m
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: start
          apiVersion: ray.io/v1
          kind: RayCluster
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          operationTimeout: 120s
      - Identifier: JobLifecycleLatency
        Method: JobLifecycleLatency
        Params:
          action: start
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          threshold: 10m
  - name: Creating RayJobs for PyTorch MNIST fine-tuning
    phases:
      - namespaceRange:
          min: 1
          max: 100
        replicasPerNamespace: 5
        tuningSet: Uniform100qps
        objectBundle:
          - basename: pytorch-mnist
            objectTemplatePath: pytorch-mnist-rayjob.yaml
            # Value substituted for {{.Image}} in the template.
            templateFillMap:
              Image: "rayproject/ray:2.9.3"
  - name: Creating RayJobs for Ray Data Image Resizing
    phases:
      - namespaceRange:
          min: 1
          max: 100
        replicasPerNamespace: 5
        tuningSet: Uniform100qps
        objectBundle:
          - basename: ray-data-image-resize
            objectTemplatePath: ray-data-image-resize.yaml
            # Value substituted for {{.Image}} in the template.
            templateFillMap:
              Image: "rayproject/ray:2.9.3"
  - name: Wait for RayJobs complete
    measurements:
      - Identifier: WaitForRayJob
        Method: Exec
        Params:
          timeout: 30m
          command:
            - "bash"
            - "common/wait-for-rayjobs.sh"
            # 500 = 100 namespaces x 5 replicas, i.e. the instance count of ONE
            # RayJob template; the two templates together create 1000 RayJobs.
            # NOTE(review): confirm wait-for-rayjobs.sh expects the per-template
            # count rather than the total.
            - "500"
  - name: Measure wait for pods to be running
    measurements:
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: gather
          operationTimeout: 10m
  - name: Measure pod startup latency
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: gather
  - name: Measure job finished
    measurements:
      - Identifier: JobLifecycleLatency
        Method: JobLifecycleLatency
        Params:
          action: gather
63 changes: 63 additions & 0 deletions
63
benchmark/perf-tests/1000-rayjob/pytorch-mnist-rayjob.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# RayJob object template for the PyTorch MNIST workload. {{.Name}} and
# {{.Image}} are template placeholders that must be filled in before the
# document is parsed as YAML.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: {{.Name}}
  labels:
    perf-test: rayjob-pytorch-mnist
spec:
  shutdownAfterJobFinishes: true
  entrypoint: python ray_train_pytorch_mnist.py
  submitterPodTemplate:
    spec:
      restartPolicy: Never
      containers:
        - name: submitter-job
          image: {{.Image}}
          command:
            - "sh"
            - "-c"
          args:
            # Follow the logs of the submission first; only if that command
            # fails is the job submitted under the same submission ID.
            - |
              #!/bin/sh
              ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
                ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
          resources:
            requests:
              cpu: "10m"
  rayClusterSpec:
    rayVersion: '2.9.3'
    headGroupSpec:
      rayStartParams:
        disable-usage-stats: 'true'
      template:
        spec:
          containers:
            - name: ray-head
              image: {{.Image}}
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
              resources:
                requests:
                  cpu: "100m"
                  memory: "4Gi"
    workerGroupSpecs:
      - replicas: 2
        minReplicas: 1
        maxReplicas: 5
        groupName: worker-group
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: {{.Image}}
                resources:
                  requests:
                    cpu: "100m"
                    memory: "4Gi"
63 changes: 63 additions & 0 deletions
63
benchmark/perf-tests/1000-rayjob/ray-data-image-resize.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# RayJob object template for the Ray Data image-resize workload. {{.Name}}
# and {{.Image}} are template placeholders that must be filled in before the
# document is parsed as YAML.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: {{.Name}}
  labels:
    perf-test: ray-data-image-resize
spec:
  shutdownAfterJobFinishes: true
  entrypoint: python ray_data_image_resize.py
  submitterPodTemplate:
    spec:
      restartPolicy: Never
      containers:
        - name: submitter-job
          image: {{.Image}}
          command:
            - "sh"
            - "-c"
          args:
            # Follow the logs of the submission first; only if that command
            # fails is the job submitted under the same submission ID.
            - |
              #!/bin/sh
              ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
                ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
          resources:
            requests:
              cpu: "10m"
  rayClusterSpec:
    rayVersion: '2.9.3'
    headGroupSpec:
      rayStartParams:
        disable-usage-stats: 'true'
      template:
        spec:
          containers:
            - name: ray-head
              image: {{.Image}}
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
              resources:
                requests:
                  cpu: "100m"
                  memory: "2Gi"
    workerGroupSpecs:
      - replicas: 2
        minReplicas: 1
        maxReplicas: 5
        groupName: worker-group
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: {{.Image}}
                resources:
                  requests:
                    cpu: "100m"
                    memory: "2Gi"
Oops, something went wrong.