From d6af494fc952070be6cadbae7ecbb3111703396f Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Fri, 28 Jun 2024 15:05:00 -0700 Subject: [PATCH 01/26] automate gpu driver install --- action.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/action.sh b/action.sh index c1911f7..83e92e5 100755 --- a/action.sh +++ b/action.sh @@ -220,6 +220,15 @@ function start_vm { systemctl start shutdown.service EOF + # Install CUDA if an accelerator is specified + if [ ! -z \"${accelerator}\" ]; then + mkdir -p /opt/google/cuda-installer + cd /opt/google/cuda-installer/ || exit + + curl -fSsL -O https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases/download/cuda-installer-v1.1.0/cuda_installer.pyz + python3 cuda_installer.pyz install_cuda + fi + # See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\ From 6083c399f1b0a4d21ea583e4db0878cd2f2af3ee Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Fri, 28 Jun 2024 15:43:13 -0700 Subject: [PATCH 02/26] try installing gpu driver in setup --- action.sh | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/action.sh b/action.sh index 83e92e5..9376a4c 100755 --- a/action.sh +++ b/action.sh @@ -220,13 +220,9 @@ function start_vm { systemctl start shutdown.service EOF - # Install CUDA if an accelerator is specified - if [ ! -z \"${accelerator}\" ]; then - mkdir -p /opt/google/cuda-installer - cd /opt/google/cuda-installer/ || exit - - curl -fSsL -O https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases/download/cuda-installer-v1.1.0/cuda_installer.pyz - python3 cuda_installer.pyz install_cuda + # Install driver if this is a deeplearning image is specified + if [ \"${image_project}\" == \"deeplearning-platform-release\" ]; then + sudo /opt/deeplearning/install-driver.sh fi # See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job @@ -321,18 +317,18 @@ function start_vm { && echo "label=${VM_ID}" >> $GITHUB_OUTPUT safety_off - while (( i++ < 60 )); do + while (( i++ < 70 )); do GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready) if [[ $GH_READY == 1 ]]; then break fi echo "${VM_ID} not ready yet, waiting 5 secs ..." - sleep 5 + sleep 6 done if [[ $GH_READY == 1 ]]; then echo "✅ ${VM_ID} ready ..." else - echo "Waited 5 minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..." + echo "Waited 7 minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..." gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone} exit 1 fi From 627d3b9c0293af42dd2ab2350fcc86f684fa7a3c Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Fri, 28 Jun 2024 15:54:53 -0700 Subject: [PATCH 03/26] make delay times clear --- action.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/action.sh b/action.sh index 9376a4c..97fcc22 100755 --- a/action.sh +++ b/action.sh @@ -317,18 +317,21 @@ function start_vm { && echo "label=${VM_ID}" >> $GITHUB_OUTPUT safety_off - while (( i++ < 70 )); do + $count=70 + $interval=6 + $minutes=$(( $count * $interval / 60 )) + while (( i++ < $count )); do GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready) if [[ $GH_READY == 1 ]]; then break fi - echo "${VM_ID} not ready yet, waiting 5 secs ..." - sleep 6 + echo "${VM_ID} not ready yet, waiting $interval secs ..." + sleep $interval done if [[ $GH_READY == 1 ]]; then echo "✅ ${VM_ID} ready ..." else - echo "Waited 7 minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..." + echo "Waited $minutes minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..." gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone} exit 1 fi From ceb28a9ea533b083b337af70a9e2992452a46959 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Fri, 28 Jun 2024 16:12:52 -0700 Subject: [PATCH 04/26] fix math --- action.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/action.sh b/action.sh index 97fcc22..92e7d4b 100755 --- a/action.sh +++ b/action.sh @@ -319,7 +319,8 @@ function start_vm { safety_off $count=70 $interval=6 - $minutes=$(( $count * $interval / 60 )) + $seconds=$(( $count * $interval )) + $minutes=$(( $seconds / 60 )) while (( i++ < $count )); do GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready) if [[ $GH_READY == 1 ]]; then From 54a9575dfdc6373749300d0ecaa8603a3efac021 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Fri, 28 Jun 2024 16:50:44 -0700 Subject: [PATCH 05/26] fix var declaration --- action.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/action.sh b/action.sh index 92e7d4b..12bb3de 100755 --- a/action.sh +++ b/action.sh @@ -317,10 +317,10 @@ function start_vm { && echo "label=${VM_ID}" >> $GITHUB_OUTPUT safety_off - $count=70 - $interval=6 - $seconds=$(( $count * $interval )) - $minutes=$(( $seconds / 60 )) + count=70 + interval=6 + seconds=$(( $count * $interval )) + minutes=$(( $seconds / 60 )) while (( i++ < $count )); do GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready) if [[ $GH_READY == 1 ]]; then From 11f23675f85e2b2aa2e1028f7d335ae64577e2c3 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 12:06:04 -0700 Subject: [PATCH 06/26] [INFRA-4283] search through all available zones --- action.sh | 68 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/action.sh b/action.sh index 12bb3de..91b9e88 100755 --- a/action.sh +++ b/action.sh @@ -160,8 +160,12 @@ function gcloud_auth { echo "✅ Successfully configured gcloud." } -function start_vm { - echo "Starting GCE VM ..." +function get_accelerator_zones { + local $accelerator=$(echo $1 | awk -F'[=,]' '{print $2}') + echo gcloud compute accelerator-types list --verbosity=error --filter="name=${accelerator} AND zone:us-*" --format="value(zone)" +} + +function setup { if [[ -z "${service_account_key}" ]] || [[ -z "${project_id}" ]]; then echo "Won't authenticate gcloud. If you wish to authenticate gcloud provide both service_account_key and project_id." else @@ -174,6 +178,10 @@ function start_vm { https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runners/registration-token |\ jq -r .token) echo "✅ Successfully got the GitHub Runner registration token" +} + +function start_vm { + echo "Starting GCE VM ..." VM_ID="gce-gh-runner-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" service_account_flag=$([[ -z "${runner_service_account}" ]] || echo "--service-account=${runner_service_account}") @@ -296,26 +304,42 @@ function start_vm { gh_repo="$(truncate_to_label "${GITHUB_REPOSITORY##*/}")" gh_run_id="${GITHUB_RUN_ID}" - gcloud compute instances create ${VM_ID} \ - --zone=${machine_zone} \ - ${disk_size_flag} \ - ${boot_disk_type_flag} \ - --machine-type=${machine_type} \ - --scopes=${scopes} \ - ${service_account_flag} \ - ${image_project_flag} \ - ${image_flag} \ - ${image_family_flag} \ - ${preemptible_flag} \ - ${no_external_address_flag} \ - ${network_flag} \ - ${subnet_flag} \ - ${accelerator} \ - ${maintenance_policy_flag} \ - --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \ - --metadata=startup-script="$startup_script" \ - && echo "label=${VM_ID}" >> $GITHUB_OUTPUT - + function create_vm { + gcloud compute instances create ${VM_ID} \ + --zone=${machine_zone} \ + ${disk_size_flag} \ + ${boot_disk_type_flag} \ + --machine-type=${machine_type} \ + --scopes=${scopes} \ + ${service_account_flag} \ + ${image_project_flag} \ + ${image_flag} \ + ${image_family_flag} \ + ${preemptible_flag} \ + ${no_external_address_flag} \ + ${network_flag} \ + ${subnet_flag} \ + ${accelerator} \ + ${maintenance_policy_flag} \ + --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \ + --metadata=startup-script="$startup_script" + } + if [[ -z "${accelerator}" ]]; then + create_vm + else + for zone in $(get_accelerator_zones $accelerator); do + echo "⚙️ Attempting creating GCE VM in zone: ${zone}" + create_vm + [[ $? -eq 0 ]] && break + done + fi + if [[ $? -eq 1 ]]; then + echo "❌ Failed to create GCE VM" + exit 1 + fi + echo "✅ Successfully created GCE VM in zone: ${zone}" + echo "label=${VM_ID}" >> $GITHUB_OUTPUT + safety_off count=70 interval=6 From 94988621b9b82c14af80ceda343a7043996bcd6c Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 12:11:49 -0700 Subject: [PATCH 07/26] fix accelerator --- action.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/action.sh b/action.sh index 91b9e88..b99d572 100755 --- a/action.sh +++ b/action.sh @@ -305,6 +305,7 @@ function start_vm { gh_run_id="${GITHUB_RUN_ID}" function create_vm { + echo "attempting to create GCE VM in zone: ${machine_zone}" gcloud compute instances create ${VM_ID} \ --zone=${machine_zone} \ ${disk_size_flag} \ @@ -328,7 +329,8 @@ function start_vm { create_vm else for zone in $(get_accelerator_zones $accelerator); do - echo "⚙️ Attempting creating GCE VM in zone: ${zone}" + machine_zone=$zone + echo "⚙️ Attempting creating GCE VM in zone: ${machine_zone}" create_vm [[ $? -eq 0 ]] && break done From c72632128d58097516eebf61a00e90111e0e1cc9 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 12:57:23 -0700 Subject: [PATCH 08/26] fix variable passing --- action.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/action.sh b/action.sh index b99d572..3d5fe6c 100755 --- a/action.sh +++ b/action.sh @@ -165,7 +165,9 @@ function get_accelerator_zones { echo gcloud compute accelerator-types list --verbosity=error --filter="name=${accelerator} AND zone:us-*" --format="value(zone)" } -function setup { +function start_vm { + echo "Starting GCE VM ..." + if [[ -z "${service_account_key}" ]] || [[ -z "${project_id}" ]]; then echo "Won't authenticate gcloud. If you wish to authenticate gcloud provide both service_account_key and project_id." else @@ -178,11 +180,7 @@ function setup { https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runners/registration-token |\ jq -r .token) echo "✅ Successfully got the GitHub Runner registration token" -} - -function start_vm { - echo "Starting GCE VM ..." - + VM_ID="gce-gh-runner-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" service_account_flag=$([[ -z "${runner_service_account}" ]] || echo "--service-account=${runner_service_account}") image_project_flag=$([[ -z "${image_project}" ]] || echo "--image-project=${image_project}") From 4aa4c112c1e13bc99183d8a73216f141116b7cf9 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:09:00 -0700 Subject: [PATCH 09/26] fix get_accelerator_zones --- action.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/action.sh b/action.sh index 3d5fe6c..bc31628 100755 --- a/action.sh +++ b/action.sh @@ -161,8 +161,8 @@ function gcloud_auth { } function get_accelerator_zones { - local $accelerator=$(echo $1 | awk -F'[=,]' '{print $2}') - echo gcloud compute accelerator-types list --verbosity=error --filter="name=${accelerator} AND zone:us-*" --format="value(zone)" + local accelerator=$(echo $1 | awk -F'[=,]' '{print $2}') + echo $(gcloud compute accelerator-types list --verbosity=error --filter="name=${accelerator} AND zone:us-*" --format="value(zone)") } function start_vm { @@ -180,7 +180,7 @@ function start_vm { https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runners/registration-token |\ jq -r .token) echo "✅ Successfully got the GitHub Runner registration token" - + VM_ID="gce-gh-runner-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" service_account_flag=$([[ -z "${runner_service_account}" ]] || echo "--service-account=${runner_service_account}") image_project_flag=$([[ -z "${image_project}" ]] || echo "--image-project=${image_project}") From 458ba567f964f676f7e92cfbddb8d8203503c785 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:16:46 -0700 Subject: [PATCH 10/26] fix var --- action.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/action.sh b/action.sh index bc31628..656d96f 100755 --- a/action.sh +++ b/action.sh @@ -337,7 +337,7 @@ function start_vm { echo "❌ Failed to create GCE VM" exit 1 fi - echo "✅ Successfully created GCE VM in zone: ${zone}" + echo "✅ Successfully created GCE VM in zone: ${machine_zone}" echo "label=${VM_ID}" >> $GITHUB_OUTPUT safety_off From 8f6576813a5123069f130fd07ec351bbc30dbe72 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:20:12 -0700 Subject: [PATCH 11/26] try to fix output --- action.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/action.sh b/action.sh index 656d96f..c272de8 100755 --- a/action.sh +++ b/action.sh @@ -161,8 +161,8 @@ function gcloud_auth { } function get_accelerator_zones { - local accelerator=$(echo $1 | awk -F'[=,]' '{print $2}') - echo $(gcloud compute accelerator-types list --verbosity=error --filter="name=${accelerator} AND zone:us-*" --format="value(zone)") + local accel=$(echo $1 | awk -F'[=,]' '{print $2}') + echo $(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel} AND zone:us-*" --format="value(zone)") } function start_vm { @@ -323,6 +323,7 @@ function start_vm { --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \ --metadata=startup-script="$startup_script" } + set -x if [[ -z "${accelerator}" ]]; then create_vm else @@ -333,6 +334,7 @@ function start_vm { [[ $? -eq 0 ]] && break done fi + set +x if [[ $? -eq 1 ]]; then echo "❌ Failed to create GCE VM" exit 1 From d82acf7e53ffc1e5522842d957cd476106745184 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:24:16 -0700 Subject: [PATCH 12/26] fix accelerator parsing --- action.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/action.sh b/action.sh index c272de8..65eb9f3 100755 --- a/action.sh +++ b/action.sh @@ -160,11 +160,6 @@ function gcloud_auth { echo "✅ Successfully configured gcloud." } -function get_accelerator_zones { - local accel=$(echo $1 | awk -F'[=,]' '{print $2}') - echo $(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel} AND zone:us-*" --format="value(zone)") -} - function start_vm { echo "Starting GCE VM ..." @@ -193,6 +188,7 @@ function start_vm { no_external_address_flag=$([[ "${no_external_address}" == "true" ]] && echo "--no-address" || echo "") network_flag=$([[ ! -z "${network}" ]] && echo "--network=${network}" || echo "") subnet_flag=$([[ ! -z "${subnet}" ]] && echo "--subnet=${subnet}" || echo "") + accel_only=$(echo ${accelerator} | awk -F'[=,]' '{print $2}') accelerator=$([[ ! -z "${accelerator}" ]] && echo "--accelerator=${accelerator} --maintenance-policy=TERMINATE" || echo "") maintenance_policy_flag=$([[ -z "${maintenance_policy_terminate}" ]] || echo "--maintenance-policy=TERMINATE" ) @@ -327,7 +323,8 @@ function start_vm { if [[ -z "${accelerator}" ]]; then create_vm else - for zone in $(get_accelerator_zones $accelerator); do + zones=$(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel_only} AND zone:us-*" --format="value(zone)") + for zone in $zones; do machine_zone=$zone echo "⚙️ Attempting creating GCE VM in zone: ${machine_zone}" create_vm From 0ca21536b49128e6f0aec677084054dcb08a8552 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:27:47 -0700 Subject: [PATCH 13/26] don't need set x --- action.sh | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/action.sh b/action.sh index 65eb9f3..7ebe025 100755 --- a/action.sh +++ b/action.sh @@ -195,47 +195,47 @@ function start_vm { echo "The new GCE VM will be ${VM_ID}" startup_script=" - # Create a systemd service in charge of shutting down the machine once the workflow has finished - cat <<-EOF > /etc/systemd/system/shutdown.sh - #!/bin/sh - sleep ${shutdown_timeout} - gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet - EOF + # Create a systemd service in charge of shutting down the machine once the workflow has finished + cat <<-EOF > /etc/systemd/system/shutdown.sh + #!/bin/sh + sleep ${shutdown_timeout} + gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet + EOF - cat <<-EOF > /etc/systemd/system/shutdown.service - [Unit] - Description=Shutdown service - [Service] - ExecStart=/etc/systemd/system/shutdown.sh - [Install] - WantedBy=multi-user.target - EOF + cat <<-EOF > /etc/systemd/system/shutdown.service + [Unit] + Description=Shutdown service + [Service] + ExecStart=/etc/systemd/system/shutdown.sh + [Install] + WantedBy=multi-user.target + EOF - chmod +x /etc/systemd/system/shutdown.sh - systemctl daemon-reload - systemctl enable shutdown.service + chmod +x /etc/systemd/system/shutdown.sh + systemctl daemon-reload + systemctl enable shutdown.service - cat <<-EOF > /usr/bin/gce_runner_shutdown.sh - #!/bin/sh - echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\" - # We tear down the machine by starting the systemd service that was registered by the startup script - systemctl start shutdown.service - EOF + cat <<-EOF > /usr/bin/gce_runner_shutdown.sh + #!/bin/sh + echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\" + # We tear down the machine by starting the systemd service that was registered by the startup script + systemctl start shutdown.service + EOF # Install driver if this is a deeplearning image is specified if [ \"${image_project}\" == \"deeplearning-platform-release\" ]; then sudo /opt/deeplearning/install-driver.sh fi - # See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job - echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env - gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\ - RUNNER_ALLOW_RUNASROOT=1 ./config.sh --url https://github.com/${GITHUB_REPOSITORY} --token ${RUNNER_TOKEN} --labels ${VM_ID} --unattended ${ephemeral_flag} --disableupdate && \\ - ./svc.sh install && \\ - ./svc.sh start && \\ - gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=1 - # 3 days represents the max workflow runtime. This will shutdown the instance if everything else fails. - nohup sh -c \"sleep 3d && gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}\" > /dev/null & + # See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job + echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env + gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\ + RUNNER_ALLOW_RUNASROOT=1 ./config.sh --url https://github.com/${GITHUB_REPOSITORY} --token ${RUNNER_TOKEN} --labels ${VM_ID} --unattended ${ephemeral_flag} --disableupdate && \\ + ./svc.sh install && \\ + ./svc.sh start && \\ + gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=1 + # 3 days represents the max workflow runtime. This will shutdown the instance if everything else fails. + nohup sh -c \"sleep 3d && gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}\" > /dev/null & " if $actions_preinstalled ; then @@ -319,7 +319,7 @@ function start_vm { --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \ --metadata=startup-script="$startup_script" } - set -x + if [[ -z "${accelerator}" ]]; then create_vm else @@ -331,7 +331,7 @@ function start_vm { [[ $? -eq 0 ]] && break done fi - set +x + if [[ $? -eq 1 ]]; then echo "❌ Failed to create GCE VM" exit 1 From 8fded23e4b7502c5e474b202de6a51893e0929f9 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:29:09 -0700 Subject: [PATCH 14/26] turn safety off --- action.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/action.sh b/action.sh index 7ebe025..9bbbbe4 100755 --- a/action.sh +++ b/action.sh @@ -319,7 +319,7 @@ function start_vm { --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \ --metadata=startup-script="$startup_script" } - + safety_off if [[ -z "${accelerator}" ]]; then create_vm else @@ -339,7 +339,6 @@ function start_vm { echo "✅ Successfully created GCE VM in zone: ${machine_zone}" echo "label=${VM_ID}" >> $GITHUB_OUTPUT - safety_off count=70 interval=6 seconds=$(( $count * $interval )) From 0ab64e34d3cb8bd031bfd9a1d81cd7104c9e091c Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:44:02 -0700 Subject: [PATCH 15/26] allow 10min to start up --- action.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/action.sh b/action.sh index 9bbbbe4..68b1347 100755 --- a/action.sh +++ b/action.sh @@ -299,7 +299,7 @@ function start_vm { gh_run_id="${GITHUB_RUN_ID}" function create_vm { - echo "attempting to create GCE VM in zone: ${machine_zone}" + echo "🔄 Attempting creating GCE VM in zone: ${machine_zone}" gcloud compute instances create ${VM_ID} \ --zone=${machine_zone} \ ${disk_size_flag} \ @@ -323,10 +323,9 @@ function start_vm { if [[ -z "${accelerator}" ]]; then create_vm else - zones=$(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel_only} AND zone:us-*" --format="value(zone)") + zones=$(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel_only} AND zone:us-*" --format="value(zone)" | shuf) for zone in $zones; do machine_zone=$zone - echo "⚙️ Attempting creating GCE VM in zone: ${machine_zone}" create_vm [[ $? -eq 0 ]] && break done @@ -339,21 +338,21 @@ function start_vm { echo "✅ Successfully created GCE VM in zone: ${machine_zone}" echo "label=${VM_ID}" >> $GITHUB_OUTPUT - count=70 - interval=6 + count=60 + interval=10 seconds=$(( $count * $interval )) minutes=$(( $seconds / 60 )) while (( i++ < $count )); do GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready) if [[ $GH_READY == 1 ]]; then + elapsed=$(($i * $interval)) + echo "✅ ${VM_ID} ready after ${elapsed} seconds" break fi echo "${VM_ID} not ready yet, waiting $interval secs ..." sleep $interval done - if [[ $GH_READY == 1 ]]; then - echo "✅ ${VM_ID} ready ..." - else + if [[ $GH_READY != 1 ]]; then echo "Waited $minutes minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..." gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone} exit 1 From 7923d151fdbeaba83e327f8e5f4cbf4b67fd49bb Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 13:54:35 -0700 Subject: [PATCH 16/26] fix script --- action.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/action.sh b/action.sh index 68b1347..901e36e 100755 --- a/action.sh +++ b/action.sh @@ -299,7 +299,7 @@ function start_vm { gh_run_id="${GITHUB_RUN_ID}" function create_vm { - echo "🔄 Attempting creating GCE VM in zone: ${machine_zone}" + echo "🔄 Attempting to create VM in zone: ${machine_zone}" gcloud compute instances create ${VM_ID} \ --zone=${machine_zone} \ ${disk_size_flag} \ From 7a877cfb6dcae1ac20ff10e10b3854a84964c078 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 14:52:31 -0700 Subject: [PATCH 17/26] maybe fix startup --- action.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/action.sh b/action.sh index 901e36e..e181f91 100755 --- a/action.sh +++ b/action.sh @@ -196,31 +196,31 @@ function start_vm { startup_script=" # Create a systemd service in charge of shutting down the machine once the workflow has finished - cat <<-EOF > /etc/systemd/system/shutdown.sh + cat <<-EOS > /etc/systemd/system/shutdown.sh #!/bin/sh sleep ${shutdown_timeout} gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet - EOF + EOS - cat <<-EOF > /etc/systemd/system/shutdown.service + cat <<-EOSD > /etc/systemd/system/shutdown.service [Unit] Description=Shutdown service [Service] ExecStart=/etc/systemd/system/shutdown.sh [Install] WantedBy=multi-user.target - EOF + EOSD chmod +x /etc/systemd/system/shutdown.sh systemctl daemon-reload systemctl enable shutdown.service - cat <<-EOF > /usr/bin/gce_runner_shutdown.sh + cat <<-EOSDR > /usr/bin/gce_runner_shutdown.sh #!/bin/sh echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\" # We tear down the machine by starting the systemd service that was registered by the startup script systemctl start shutdown.service - EOF + EOSDR # Install driver if this is a deeplearning image is specified if [ \"${image_project}\" == \"deeplearning-platform-release\" ]; then @@ -260,7 +260,7 @@ function start_vm { cd /actions-runner curl -o actions-runner-linux-arm64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-arm64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-arm64-${runner_ver}.tar.gz - ./bin/installdependencies.sh && \\ + ./bin/installdependencies.sh $startup_script" else startup_script="#!/bin/bash @@ -268,7 +268,7 @@ function start_vm { cd /actions-runner curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz - ./bin/installdependencies.sh && \\ + ./bin/installdependencies.sh $startup_script" fi fi From 94740f40d0b82b3a978f850221f7c85a45d5ffb2 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 16:35:24 -0700 Subject: [PATCH 18/26] install libicu ahead of time --- action.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/action.sh b/action.sh index e181f91..ca2e701 100755 --- a/action.sh +++ b/action.sh @@ -258,6 +258,27 @@ function start_vm { startup_script="#!/bin/bash mkdir /actions-runner cd /actions-runner + # START HACK As of 2024-07-15 the runner installer hangs on trying to install libicu, so we install it manually + if [ -e /etc/debian_version ]; then + # prefer apt-get over apt + command -v apt-get + if [ $? -eq 0 ] + then + apt_get=apt-get + else + command -v apt + if [ $? -eq 0 ] + then + apt_get=apt + else + echo "Found neither 'apt-get' nor 'apt'" + print_errormessage + exit 1 + fi + fi + $apt_get update && $apt_get install -y libicu-dev + fi + # END HACK curl -o actions-runner-linux-arm64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-arm64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-arm64-${runner_ver}.tar.gz ./bin/installdependencies.sh From 5e46834ba3540d51c5a32ae655ae013b9e818b78 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 16:36:08 -0700 Subject: [PATCH 19/26] match standards --- action.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/action.sh b/action.sh index ca2e701..4c028f0 100755 --- a/action.sh +++ b/action.sh @@ -262,13 +262,11 @@ function start_vm { if [ -e /etc/debian_version ]; then # prefer apt-get over apt command -v apt-get - if [ $? -eq 0 ] - then + if [ $? -eq 0 ]; then apt_get=apt-get else command -v apt - if [ $? -eq 0 ] - then + if [ $? -eq 0 ]; then apt_get=apt else echo "Found neither 'apt-get' nor 'apt'" From 34a65154087b87f692bb369aa05d37f448bfe543 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 16:58:03 -0700 Subject: [PATCH 20/26] add libicu install in right place --- action.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/action.sh b/action.sh index 4c028f0..11409fa 100755 --- a/action.sh +++ b/action.sh @@ -255,6 +255,14 @@ function start_vm { fi echo "✅ Startup script will install GitHub Actions v$runner_ver" if $arm ; then + startup_script="#!/bin/bash + mkdir /actions-runner + cd /actions-runner + curl -o actions-runner-linux-arm64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-arm64-${runner_ver}.tar.gz + tar xzf ./actions-runner-linux-arm64-${runner_ver}.tar.gz + ./bin/installdependencies.sh + $startup_script" + else startup_script="#!/bin/bash mkdir /actions-runner cd /actions-runner @@ -277,14 +285,6 @@ function start_vm { $apt_get update && $apt_get install -y libicu-dev fi # END HACK - curl -o actions-runner-linux-arm64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-arm64-${runner_ver}.tar.gz - tar xzf ./actions-runner-linux-arm64-${runner_ver}.tar.gz - ./bin/installdependencies.sh - $startup_script" - else - startup_script="#!/bin/bash - mkdir /actions-runner - cd /actions-runner curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz ./bin/installdependencies.sh From 7e0a8e6cef9d063a707962c3abb9c0e456684b93 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 17:00:50 -0700 Subject: [PATCH 21/26] install manually --- action.sh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/action.sh b/action.sh index 11409fa..d62332b 100755 --- a/action.sh +++ b/action.sh @@ -268,21 +268,7 @@ function start_vm { cd /actions-runner # START HACK As of 2024-07-15 the runner installer hangs on trying to install libicu, so we install it manually if [ -e /etc/debian_version ]; then - # prefer apt-get over apt - command -v apt-get - if [ $? -eq 0 ]; then - apt_get=apt-get - else - command -v apt - if [ $? -eq 0 ]; then - apt_get=apt - else - echo "Found neither 'apt-get' nor 'apt'" - print_errormessage - exit 1 - fi - fi - $apt_get update && $apt_get install -y libicu-dev + apt-get update && apt-get install -y libicu-dev fi # END HACK curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz From 3eb861e5f4db27fd3acc15bb674891fc3c82f607 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 17:03:36 -0700 Subject: [PATCH 22/26] no comments allowed --- action.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/action.sh b/action.sh index d62332b..5064cc3 100755 --- a/action.sh +++ b/action.sh @@ -265,15 +265,10 @@ function start_vm { else startup_script="#!/bin/bash mkdir /actions-runner - cd /actions-runner - # START HACK As of 2024-07-15 the runner installer hangs on trying to install libicu, so we install it manually - if [ -e /etc/debian_version ]; then - apt-get update && apt-get install -y libicu-dev - fi - # END HACK + apt-get update && apt-get install -y libicu-dev curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz - ./bin/installdependencies.sh + ./bin/installdependencies.sh && \\ $startup_script" fi fi From 8e61c33c5cbdd77229aa63b9dc20dde61358c799 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 17:21:24 -0700 Subject: [PATCH 23/26] try again --- action.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/action.sh b/action.sh index 5064cc3..aa73a47 100755 --- a/action.sh +++ b/action.sh @@ -260,15 +260,16 @@ function start_vm { cd /actions-runner curl -o actions-runner-linux-arm64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-arm64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-arm64-${runner_ver}.tar.gz - ./bin/installdependencies.sh + ./bin/installdependencies.sh && \\ $startup_script" else startup_script="#!/bin/bash mkdir /actions-runner - apt-get update && apt-get install -y libicu-dev curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz - ./bin/installdependencies.sh && \\ + echo \"installing dependencies\" + ./bin/installdependencies.sh + echo \"dependencies installed\" $startup_script" fi fi From 0f917e351822ff04ec8a47dd6efe4b3b84f2894a Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 15 Jul 2024 17:26:02 -0700 Subject: [PATCH 24/26] the CD part is important --- action.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/action.sh b/action.sh index aa73a47..ef88f95 100755 --- a/action.sh +++ b/action.sh @@ -265,6 +265,7 @@ function start_vm { else startup_script="#!/bin/bash mkdir /actions-runner + cd /actions-runner curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz echo \"installing dependencies\" From d17c767dfccd57a5be863e173e018a3bd5b682fc Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Tue, 16 Jul 2024 11:09:56 -0700 Subject: [PATCH 25/26] install dependencies manually --- action.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/action.sh b/action.sh index ef88f95..19fd45c 100755 --- a/action.sh +++ b/action.sh @@ -269,7 +269,8 @@ function start_vm { curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz echo \"installing dependencies\" - ./bin/installdependencies.sh + export DEBIAN_FRONTEND=noninteractive + apt-get update && apt-get install -y libkrb5-3 zlib1g libttng-ust0 libicu66 echo \"dependencies installed\" $startup_script" fi From 6e621d2105242211b4339f1a37c00a9d8781e408 Mon Sep 17 00:00:00 2001 From: Ethan Fremen Date: Mon, 22 Jul 2024 11:44:46 -0700 Subject: [PATCH 26/26] try installing things normally --- action.sh | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/action.sh b/action.sh index 19fd45c..8b46b20 100755 --- a/action.sh +++ b/action.sh @@ -222,11 +222,6 @@ function start_vm { systemctl start shutdown.service EOSDR - # Install driver if this is a deeplearning image is specified - if [ \"${image_project}\" == \"deeplearning-platform-release\" ]; then - sudo /opt/deeplearning/install-driver.sh - fi - # See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\ @@ -263,15 +258,13 @@ function start_vm { ./bin/installdependencies.sh && \\ $startup_script" else - startup_script="#!/bin/bash + startup_script="#! /bin/bash mkdir /actions-runner cd /actions-runner - curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz + curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -sS -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz - echo \"installing dependencies\" export DEBIAN_FRONTEND=noninteractive - apt-get update && apt-get install -y libkrb5-3 zlib1g libttng-ust0 libicu66 - echo \"dependencies installed\" + ./bin/installdependencies.sh && \\ $startup_script" fi fi @@ -341,7 +334,7 @@ function start_vm { echo "✅ Successfully created GCE VM in zone: ${machine_zone}" echo "label=${VM_ID}" >> $GITHUB_OUTPUT - count=60 + count=120 interval=10 seconds=$(( $count * $interval )) minutes=$(( $seconds / 60 ))