Skip to content

Commit

Permalink
Buildkite support for the --kubernetes flag with EKS/GKE cluster (#4684)
Browse files Browse the repository at this point in the history

* run on gke

* support eks test

* require eks

* replace all require_gke

* resolve conflict

* rename mark to resource_heavy and support both GKE and EKS

* remove mark for test_launch_fast
  • Loading branch information
zpoint authored Feb 12, 2025
1 parent 0a077c4 commit b2d0333
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 6 deletions.
12 changes: 10 additions & 2 deletions .buildkite/generate_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,12 @@
QUEUE_GENERIC_CLOUD = 'generic_cloud'
QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve'
QUEUE_KUBERNETES = 'kubernetes'
QUEUE_EKS = 'eks'
QUEUE_GKE = 'gke'
# The KUBE_BACKEND environment variable selects the queue used for Kubernetes
# tests marked as resource_heavy. It must be either EKS or GKE (matched
# case-insensitively) and defaults to EKS when unset.
QUEUE_KUBE_BACKEND = os.getenv('KUBE_BACKEND', QUEUE_EKS).lower()
# Validate with an explicit raise instead of `assert`: assertions are removed
# when Python runs with -O, which would let an unknown queue name through.
if QUEUE_KUBE_BACKEND not in (QUEUE_EKS, QUEUE_GKE):
    raise ValueError(
        f'Invalid KUBE_BACKEND {QUEUE_KUBE_BACKEND!r}: expected '
        f'{QUEUE_EKS!r} or {QUEUE_GKE!r}.')
# Only aws, gcp, azure, and kubernetes are supported for now.
# Other clouds do not have credentials.
CLOUD_QUEUE_MAP = {
Expand Down Expand Up @@ -174,7 +179,9 @@ def _extract_marked_tests(
for function_name, marks in function_name_marks_map.items():
clouds_to_include = []
is_serve_test = 'serve' in marks
run_on_gke = 'requires_gke' in marks
run_on_cloud_kube_backend = ('resource_heavy' in marks and
'kubernetes' in default_clouds_to_run)

for mark in marks:
if mark not in PYTEST_TO_CLOUD_KEYWORD:
# This mark does not specify a cloud, so we skip it.
Expand Down Expand Up @@ -210,7 +217,8 @@ def _extract_marked_tests(
param_list += [None
] * (len(final_clouds_to_include) - len(param_list))
function_cloud_map[function_name] = (final_clouds_to_include, [
QUEUE_GKE if run_on_gke else cloud_queue_map[cloud]
QUEUE_KUBE_BACKEND
if run_on_cloud_kube_backend else cloud_queue_map[cloud]
for cloud in final_clouds_to_include
], param_list)

Expand Down
12 changes: 9 additions & 3 deletions tests/smoke_tests/test_cluster_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
@pytest.mark.no_scp # SCP does not have T4 gpus. Run test_scp_job_queue instead
@pytest.mark.no_paperspace # Paperspace does not have T4 gpus.
@pytest.mark.no_oci # OCI does not have T4 gpus
@pytest.mark.resource_heavy
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
def test_job_queue(generic_cloud: str, accelerator: Dict[str, str]):
accelerator = accelerator.get(generic_cloud, 'T4')
Expand Down Expand Up @@ -267,6 +268,7 @@ def test_job_queue_multinode(generic_cloud: str, accelerator: Dict[str, str]):
@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs
@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs
@pytest.mark.no_vast # Vast doesn't guarantee exactly 8 CPUs, only at least.
@pytest.mark.resource_heavy
def test_large_job_queue(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
Expand Down Expand Up @@ -313,6 +315,7 @@ def test_large_job_queue(generic_cloud: str):
@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs
@pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs
@pytest.mark.no_vast # No Vast Cloud VM has 8 CPUs
@pytest.mark.resource_heavy
def test_fast_large_job_queue(generic_cloud: str):
# This is to test that jobs can be scheduled quickly when there are many jobs in the queue.
name = smoke_tests_utils.get_cluster_name()
Expand Down Expand Up @@ -401,6 +404,7 @@ def test_docker_preinstalled_package(generic_cloud: str):
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
@pytest.mark.no_oci # OCI Cloud does not have T4 gpus
@pytest.mark.no_do # DO does not have T4 gpus
@pytest.mark.resource_heavy
def test_multi_echo(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
Expand Down Expand Up @@ -444,6 +448,7 @@ def test_multi_echo(generic_cloud: str):
@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus
@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA
@pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead.
@pytest.mark.resource_heavy
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
def test_huggingface(generic_cloud: str, accelerator: Dict[str, str]):
accelerator = accelerator.get(generic_cloud, 'T4')
Expand Down Expand Up @@ -575,7 +580,6 @@ def test_tpu_vm_pod():


# ---------- TPU Pod Slice on GKE. ----------
@pytest.mark.requires_gke
@pytest.mark.kubernetes
@pytest.mark.skip
def test_tpu_pod_slice_gke():
Expand Down Expand Up @@ -695,6 +699,7 @@ def test_azure_http_server_with_custom_ports():

# ---------- Web apps with custom ports on Kubernetes. ----------
@pytest.mark.kubernetes
@pytest.mark.resource_heavy
def test_kubernetes_http_server_with_custom_ports():
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
Expand Down Expand Up @@ -888,7 +893,7 @@ def test_add_and_remove_pod_annotations_with_autostop():


# ---------- Container logs from task on Kubernetes ----------
@pytest.mark.requires_gke
@pytest.mark.resource_heavy
@pytest.mark.kubernetes
def test_container_logs_multinode_kubernetes():
name = smoke_tests_utils.get_cluster_name()
Expand Down Expand Up @@ -1256,6 +1261,7 @@ def test_cancel_azure():
@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
@pytest.mark.no_vast # Vast does not support num_nodes > 1 yet
@pytest.mark.resource_heavy
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
def test_cancel_pytorch(generic_cloud: str, accelerator: Dict[str, str]):
accelerator = accelerator.get(generic_cloud, 'T4')
Expand Down Expand Up @@ -1445,7 +1451,7 @@ def test_aws_custom_image():
smoke_tests_utils.run_one_test(test)


@pytest.mark.requires_gke
@pytest.mark.resource_heavy
@pytest.mark.kubernetes
@pytest.mark.parametrize(
'image_id',
Expand Down
3 changes: 3 additions & 0 deletions tests/smoke_tests/test_managed_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
# when the controller being on Azure, which takes a long time for launching
# step.
@pytest.mark.managed_jobs
@pytest.mark.resource_heavy
def test_managed_jobs_basic(generic_cloud: str):
"""Test the managed jobs yaml."""
name = smoke_tests_utils.get_cluster_name()
Expand Down Expand Up @@ -698,6 +699,7 @@ def test_managed_jobs_retry_logs(generic_cloud: str):
@pytest.mark.no_do # DO does not support spot instances
@pytest.mark.no_vast # Uses other clouds
@pytest.mark.managed_jobs
@pytest.mark.resource_heavy
def test_managed_jobs_storage(generic_cloud: str):
"""Test storage with managed job"""
name = smoke_tests_utils.get_cluster_name()
Expand Down Expand Up @@ -884,6 +886,7 @@ def test_managed_jobs_tpu():
# ---------- Testing env for managed jobs ----------
@pytest.mark.no_vast # Uses unsatisfiable machines
@pytest.mark.managed_jobs
@pytest.mark.resource_heavy
def test_managed_jobs_inline_env(generic_cloud: str):
"""Test managed jobs env"""
name = smoke_tests_utils.get_cluster_name()
Expand Down
1 change: 1 addition & 0 deletions tests/smoke_tests/test_mount_and_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ def test_kubernetes_context_switch():


@pytest.mark.no_vast # Requires AWS
@pytest.mark.resource_heavy
@pytest.mark.parametrize(
'image_id',
[
Expand Down
14 changes: 13 additions & 1 deletion tests/smoke_tests/test_sky_serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_skyserve_azure_http():

@pytest.mark.kubernetes
@pytest.mark.serve
@pytest.mark.requires_gke
@pytest.mark.resource_heavy
def test_skyserve_kubernetes_http():
"""Test skyserve on Kubernetes"""
name = _get_service_name()
Expand All @@ -241,6 +241,7 @@ def test_skyserve_oci_http():
@pytest.mark.no_vast # Vast has low availability of T4 GPUs
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_llm(generic_cloud: str, accelerator: Dict[str, str]):
"""Test skyserve with real LLM usecase"""
accelerator = accelerator.get(generic_cloud, 'T4')
Expand Down Expand Up @@ -370,6 +371,7 @@ def test_skyserve_dynamic_ondemand_fallback():
@pytest.mark.no_do # DO does not support `--cpus 2`
@pytest.mark.serve
@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.resource_heavy
def test_skyserve_user_bug_restart(generic_cloud: str):
"""Tests that we restart the service after user bug."""
# TODO(zhwu): this behavior needs some rethinking.
Expand Down Expand Up @@ -471,6 +473,7 @@ def test_skyserve_auto_restart():

@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_cancel(generic_cloud: str):
"""Test skyserve with cancel"""
name = _get_service_name()
Expand All @@ -497,6 +500,7 @@ def test_skyserve_cancel(generic_cloud: str):

@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_streaming(generic_cloud: str):
"""Test skyserve with streaming"""
name = _get_service_name()
Expand Down Expand Up @@ -541,6 +545,7 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str):

@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_large_readiness_timeout(generic_cloud: str):
"""Test skyserve with customized large readiness timeout"""
name = _get_service_name()
Expand All @@ -563,6 +568,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str):
@pytest.mark.no_do # DO does not support `--cpus 2`
@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_update(generic_cloud: str):
"""Test skyserve with update"""
name = _get_service_name()
Expand Down Expand Up @@ -595,6 +601,7 @@ def test_skyserve_update(generic_cloud: str):
@pytest.mark.no_do # DO does not support `--cpus 2`
@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_rolling_update(generic_cloud: str):
"""Test skyserve with rolling update"""
name = _get_service_name()
Expand Down Expand Up @@ -633,6 +640,7 @@ def test_skyserve_rolling_update(generic_cloud: str):
@pytest.mark.no_fluidstack
@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_fast_update(generic_cloud: str):
"""Test skyserve with fast update (Increment version of old replicas)"""
name = _get_service_name()
Expand Down Expand Up @@ -675,6 +683,7 @@ def test_skyserve_fast_update(generic_cloud: str):

@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_update_autoscale(generic_cloud: str):
"""Test skyserve update with autoscale"""
name = _get_service_name()
Expand Down Expand Up @@ -781,6 +790,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str):
@pytest.mark.no_do # DO does not support `--cpus 2`
@pytest.mark.no_vast # Vast doesn't support opening ports
@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_failures(generic_cloud: str):
"""Test replica failure statuses"""
name = _get_service_name()
Expand Down Expand Up @@ -828,6 +838,7 @@ def test_skyserve_failures(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_https(generic_cloud: str):
"""Test skyserve with https"""
name = _get_service_name()
Expand Down Expand Up @@ -865,6 +876,7 @@ def test_skyserve_https(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.resource_heavy
def test_skyserve_multi_ports(generic_cloud: str):
"""Test skyserve with multiple ports"""
name = _get_service_name()
Expand Down

0 comments on commit b2d0333

Please sign in to comment.