diff --git a/action.sh b/action.sh
index c1911f7..8b46b20 100755
--- a/action.sh
+++ b/action.sh
@@ -162,6 +162,7 @@ function gcloud_auth {
 
 function start_vm {
   echo "Starting GCE VM ..."
+
   if [[ -z "${service_account_key}" ]] || [[ -z "${project_id}" ]]; then
     echo "Won't authenticate gcloud. If you wish to authenticate gcloud provide both service_account_key and project_id."
   else
@@ -187,48 +188,50 @@ function start_vm {
   no_external_address_flag=$([[ "${no_external_address}" == "true" ]] && echo "--no-address" || echo "")
   network_flag=$([[ ! -z "${network}" ]] && echo "--network=${network}" || echo "")
   subnet_flag=$([[ ! -z "${subnet}" ]] && echo "--subnet=${subnet}" || echo "")
+  # Second field of the accelerator spec when split on '=' or ','; used below to filter accelerator-types by name.
+  accel_only=$(echo "${accelerator}" | awk -F'[=,]' '{print $2}')
   accelerator=$([[ ! -z "${accelerator}" ]] && echo "--accelerator=${accelerator} --maintenance-policy=TERMINATE" || echo "")
   maintenance_policy_flag=$([[ -z "${maintenance_policy_terminate}" ]] || echo "--maintenance-policy=TERMINATE" )
 
   echo "The new GCE VM will be ${VM_ID}"
 
   startup_script="
-	# Create a systemd service in charge of shutting down the machine once the workflow has finished
-	cat <<-EOF > /etc/systemd/system/shutdown.sh
-	#!/bin/sh
-	sleep ${shutdown_timeout}
-	gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet
-	EOF
+		# Create a systemd service in charge of shutting down the machine once the workflow has finished
+		cat <<-EOS > /etc/systemd/system/shutdown.sh
+		#!/bin/sh
+		sleep ${shutdown_timeout}
+		gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet
+		EOS
 
-	cat <<-EOF > /etc/systemd/system/shutdown.service
-	[Unit]
-	Description=Shutdown service
-	[Service]
-	ExecStart=/etc/systemd/system/shutdown.sh
-	[Install]
-	WantedBy=multi-user.target
-	EOF
+		cat <<-EOSD > /etc/systemd/system/shutdown.service
+		[Unit]
+		Description=Shutdown service
+		[Service]
+		ExecStart=/etc/systemd/system/shutdown.sh
+		[Install]
+		WantedBy=multi-user.target
+		EOSD
 
-	chmod +x /etc/systemd/system/shutdown.sh
-	systemctl daemon-reload
-	systemctl enable shutdown.service
+		chmod +x /etc/systemd/system/shutdown.sh
+		systemctl daemon-reload
+		systemctl enable shutdown.service
 
-	cat <<-EOF > /usr/bin/gce_runner_shutdown.sh
-	#!/bin/sh
-	echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\"
-	# We tear down the machine by starting the systemd service that was registered by the startup script
-	systemctl start shutdown.service
-	EOF
+		cat <<-EOSDR > /usr/bin/gce_runner_shutdown.sh
+		#!/bin/sh
+		echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\"
+		# We tear down the machine by starting the systemd service that was registered by the startup script
+		systemctl start shutdown.service
+		EOSDR
 
-	# See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job
-	echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env
-	gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\
-	RUNNER_ALLOW_RUNASROOT=1 ./config.sh --url https://github.com/${GITHUB_REPOSITORY} --token ${RUNNER_TOKEN} --labels ${VM_ID} --unattended ${ephemeral_flag} --disableupdate && \\
-	./svc.sh install && \\
-	./svc.sh start && \\
-	gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=1
-	# 3 days represents the max workflow runtime. This will shutdown the instance if everything else fails.
-	nohup sh -c \"sleep 3d && gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}\" > /dev/null &
+		# See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job
+		echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env
+		gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\
+		RUNNER_ALLOW_RUNASROOT=1 ./config.sh --url https://github.com/${GITHUB_REPOSITORY} --token ${RUNNER_TOKEN} --labels ${VM_ID} --unattended ${ephemeral_flag} --disableupdate && \\
+		./svc.sh install && \\
+		./svc.sh start && \\
+		gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=1
+		# 3 days represents the max workflow runtime. This will shutdown the instance if everything else fails.
+		nohup sh -c \"sleep 3d && gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}\" > /dev/null &
   "
 
   if $actions_preinstalled ; then
@@ -256,11 +259,12 @@ function start_vm {
     ./bin/installdependencies.sh && \\
     $startup_script"
   else
-    startup_script="#!/bin/bash
+    startup_script="#! /bin/bash
     mkdir /actions-runner
     cd /actions-runner
-    curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz
+    curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -sS -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz
     tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz
+    export DEBIAN_FRONTEND=noninteractive
     ./bin/installdependencies.sh && \\
     $startup_script"
   fi
@@ -291,39 +295,72 @@ function start_vm {
   gh_repo="$(truncate_to_label "${GITHUB_REPOSITORY##*/}")"
   gh_run_id="${GITHUB_RUN_ID}"
 
-  gcloud compute instances create ${VM_ID} \
-    --zone=${machine_zone} \
-    ${disk_size_flag} \
-    ${boot_disk_type_flag} \
-    --machine-type=${machine_type} \
-    --scopes=${scopes} \
-    ${service_account_flag} \
-    ${image_project_flag} \
-    ${image_flag} \
-    ${image_family_flag} \
-    ${preemptible_flag} \
-    ${no_external_address_flag} \
-    ${network_flag} \
-    ${subnet_flag} \
-    ${accelerator} \
-    ${maintenance_policy_flag} \
-    --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \
-    --metadata=startup-script="$startup_script" \
-  && echo "label=${VM_ID}" >> $GITHUB_OUTPUT
-
+  # Creates the VM in ${machine_zone}; its exit status is that of 'gcloud compute instances create'.
+  function create_vm {
+    echo "🔄 Attempting to create VM in zone: ${machine_zone}"
+    gcloud compute instances create ${VM_ID} \
+      --zone=${machine_zone} \
+      ${disk_size_flag} \
+      ${boot_disk_type_flag} \
+      --machine-type=${machine_type} \
+      --scopes=${scopes} \
+      ${service_account_flag} \
+      ${image_project_flag} \
+      ${image_flag} \
+      ${image_family_flag} \
+      ${preemptible_flag} \
+      ${no_external_address_flag} \
+      ${network_flag} \
+      ${subnet_flag} \
+      ${accelerator} \
+      ${maintenance_policy_flag} \
+      --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \
+      --metadata=startup-script="$startup_script"
+  }
   safety_off
-  while (( i++ < 60 )); do
+  # Track success explicitly: \$? after the if/for compound is 0 when no zone was tried and misses non-1 failure codes.
+  vm_created=false
+  if [[ -z "${accelerator}" ]]; then
+    if create_vm; then
+      vm_created=true
+    fi
+  else
+    # Try each matching us-* zone that lists the accelerator, in shuffled order, until one creation succeeds.
+    zones=$(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel_only} AND zone:us-*" --format="value(zone)" | shuf)
+    for zone in $zones; do
+      machine_zone=$zone
+      if create_vm; then
+        vm_created=true
+        break
+      fi
+    done
+  fi
+
+  if [[ "${vm_created}" != "true" ]]; then
+    echo "❌ Failed to create GCE VM"
+    exit 1
+  fi
+  echo "✅ Successfully created GCE VM in zone: ${machine_zone}"
+  echo "label=${VM_ID}" >> "$GITHUB_OUTPUT"
+
+  count=120
+  interval=10
+  seconds=$(( count * interval ))
+  minutes=$(( seconds / 60 ))
+  i=0
+  while (( i++ < count )); do
     GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready)
     if [[ $GH_READY == 1 ]]; then
+      # The loop test has already incremented i, so only (i - 1) sleeps have happened so far.
+      elapsed=$(( (i - 1) * interval ))
+      echo "✅ ${VM_ID} ready after ${elapsed} seconds"
       break
     fi
-    echo "${VM_ID} not ready yet, waiting 5 secs ..."
-    sleep 5
+    echo "${VM_ID} not ready yet, waiting $interval secs ..."
+    sleep $interval
   done
-  if [[ $GH_READY == 1 ]]; then
-    echo "✅ ${VM_ID} ready ..."
-  else
-    echo "Waited 5 minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..."
+  if [[ $GH_READY != 1 ]]; then
+    echo "Waited $minutes minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..."
     gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}
     exit 1
   fi