Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[INFRA-4283] make runner able to iterate through zones #50

Closed
wants to merge 26 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 87 additions & 61 deletions action.sh
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ function gcloud_auth {

function start_vm {
echo "Starting GCE VM ..."

if [[ -z "${service_account_key}" ]] || [[ -z "${project_id}" ]]; then
echo "Won't authenticate gcloud. If you wish to authenticate gcloud provide both service_account_key and project_id."
else
Expand All @@ -187,48 +188,49 @@ function start_vm {
no_external_address_flag=$([[ "${no_external_address}" == "true" ]] && echo "--no-address" || echo "")
network_flag=$([[ ! -z "${network}" ]] && echo "--network=${network}" || echo "")
subnet_flag=$([[ ! -z "${subnet}" ]] && echo "--subnet=${subnet}" || echo "")
accel_only=$(echo ${accelerator} | awk -F'[=,]' '{print $2}')
accelerator=$([[ ! -z "${accelerator}" ]] && echo "--accelerator=${accelerator} --maintenance-policy=TERMINATE" || echo "")
maintenance_policy_flag=$([[ -z "${maintenance_policy_terminate}" ]] || echo "--maintenance-policy=TERMINATE" )

echo "The new GCE VM will be ${VM_ID}"

startup_script="
# Create a systemd service in charge of shutting down the machine once the workflow has finished
cat <<-EOF > /etc/systemd/system/shutdown.sh
#!/bin/sh
sleep ${shutdown_timeout}
gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet
EOF
# Create a systemd service in charge of shutting down the machine once the workflow has finished
cat <<-EOS > /etc/systemd/system/shutdown.sh
#!/bin/sh
sleep ${shutdown_timeout}
gcloud compute instances delete $VM_ID --zone=$machine_zone --quiet
EOS

cat <<-EOF > /etc/systemd/system/shutdown.service
[Unit]
Description=Shutdown service
[Service]
ExecStart=/etc/systemd/system/shutdown.sh
[Install]
WantedBy=multi-user.target
EOF
cat <<-EOSD > /etc/systemd/system/shutdown.service
[Unit]
Description=Shutdown service
[Service]
ExecStart=/etc/systemd/system/shutdown.sh
[Install]
WantedBy=multi-user.target
EOSD

chmod +x /etc/systemd/system/shutdown.sh
systemctl daemon-reload
systemctl enable shutdown.service
chmod +x /etc/systemd/system/shutdown.sh
systemctl daemon-reload
systemctl enable shutdown.service

cat <<-EOF > /usr/bin/gce_runner_shutdown.sh
#!/bin/sh
echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\"
# We tear down the machine by starting the systemd service that was registered by the startup script
systemctl start shutdown.service
EOF
cat <<-EOSDR > /usr/bin/gce_runner_shutdown.sh
#!/bin/sh
echo \"✅ Self deleting $VM_ID in ${machine_zone} in ${shutdown_timeout} seconds ...\"
# We tear down the machine by starting the systemd service that was registered by the startup script
systemctl start shutdown.service
EOSDR

# See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job
echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env
gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\
RUNNER_ALLOW_RUNASROOT=1 ./config.sh --url https://github.com/${GITHUB_REPOSITORY} --token ${RUNNER_TOKEN} --labels ${VM_ID} --unattended ${ephemeral_flag} --disableupdate && \\
./svc.sh install && \\
./svc.sh start && \\
gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=1
# 3 days represents the max workflow runtime. This will shutdown the instance if everything else fails.
nohup sh -c \"sleep 3d && gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}\" > /dev/null &
# See: https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job
echo "ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/usr/bin/gce_runner_shutdown.sh" >.env
gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=0 && \\
RUNNER_ALLOW_RUNASROOT=1 ./config.sh --url https://github.com/${GITHUB_REPOSITORY} --token ${RUNNER_TOKEN} --labels ${VM_ID} --unattended ${ephemeral_flag} --disableupdate && \\
./svc.sh install && \\
./svc.sh start && \\
gcloud compute instances add-labels ${VM_ID} --zone=${machine_zone} --labels=gh_ready=1
# 3 days represents the max workflow runtime. This will shutdown the instance if everything else fails.
nohup sh -c \"sleep 3d && gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}\" > /dev/null &
"

if $actions_preinstalled ; then
Expand Down Expand Up @@ -256,11 +258,12 @@ function start_vm {
./bin/installdependencies.sh && \\
$startup_script"
else
startup_script="#!/bin/bash
startup_script="#! /bin/bash
mkdir /actions-runner
cd /actions-runner
curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz
curl -o actions-runner-linux-x64-${runner_ver}.tar.gz -sS -L https://github.com/actions/runner/releases/download/v${runner_ver}/actions-runner-linux-x64-${runner_ver}.tar.gz
tar xzf ./actions-runner-linux-x64-${runner_ver}.tar.gz
export DEBIAN_FRONTEND=noninteractive
./bin/installdependencies.sh && \\
$startup_script"
fi
Expand Down Expand Up @@ -291,39 +294,62 @@ function start_vm {
gh_repo="$(truncate_to_label "${GITHUB_REPOSITORY##*/}")"
gh_run_id="${GITHUB_RUN_ID}"

gcloud compute instances create ${VM_ID} \
--zone=${machine_zone} \
${disk_size_flag} \
${boot_disk_type_flag} \
--machine-type=${machine_type} \
--scopes=${scopes} \
${service_account_flag} \
${image_project_flag} \
${image_flag} \
${image_family_flag} \
${preemptible_flag} \
${no_external_address_flag} \
${network_flag} \
${subnet_flag} \
${accelerator} \
${maintenance_policy_flag} \
--labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \
--metadata=startup-script="$startup_script" \
&& echo "label=${VM_ID}" >> $GITHUB_OUTPUT

#######################################
# Create the runner VM with `gcloud compute instances create` in the zone
# currently held by machine_zone.
# Globals (read):
#   VM_ID, machine_zone, machine_type, scopes, startup_script,
#   gh_repo_owner, gh_repo, gh_run_id,
#   disk_size_flag, boot_disk_type_flag, service_account_flag,
#   image_project_flag, image_flag, image_family_flag, preemptible_flag,
#   no_external_address_flag, network_flag, subnet_flag, accelerator,
#   maintenance_policy_flag
# Outputs:
#   A progress line on stdout, followed by whatever gcloud prints.
# Returns:
#   The exit status of the gcloud command, so callers can retry elsewhere.
#######################################
function create_vm {
  echo "🔄 Attempting to create VM in zone: ${machine_zone}"

  # Mandatory values are quoted so they always reach gcloud as exactly one
  # argument. The optional *_flag variables (and ${accelerator}) stay unquoted
  # on purpose: an empty one must vanish from the argument list, and
  # ${accelerator} holds two space-separated flags that have to split.
  # shellcheck disable=SC2086
  gcloud compute instances create "${VM_ID}" \
    --zone="${machine_zone}" \
    ${disk_size_flag} \
    ${boot_disk_type_flag} \
    --machine-type="${machine_type}" \
    --scopes="${scopes}" \
    ${service_account_flag} \
    ${image_project_flag} \
    ${image_flag} \
    ${image_family_flag} \
    ${preemptible_flag} \
    ${no_external_address_flag} \
    ${network_flag} \
    ${subnet_flag} \
    ${accelerator} \
    ${maintenance_policy_flag} \
    --labels=gh_ready=0,gh_repo_owner="${gh_repo_owner}",gh_repo="${gh_repo}",gh_run_id="${gh_run_id}" \
    --metadata=startup-script="${startup_script}"
}
safety_off
while (( i++ < 60 )); do
if [[ -z "${accelerator}" ]]; then
create_vm
else
zones=$(gcloud compute accelerator-types list --verbosity=error --filter="name=${accel_only} AND zone:us-*" --format="value(zone)" | shuf)
for zone in $zones; do
machine_zone=$zone
create_vm
[[ $? -eq 0 ]] && break
done
fi

if [[ $? -eq 1 ]]; then
echo "❌ Failed to create GCE VM"
exit 1
fi
echo "✅ Successfully created GCE VM in zone: ${machine_zone}"
echo "label=${VM_ID}" >> $GITHUB_OUTPUT

count=120
interval=10
seconds=$(( $count * $interval ))
minutes=$(( $seconds / 60 ))
while (( i++ < $count )); do
GH_READY=$(gcloud compute instances describe ${VM_ID} --zone=${machine_zone} --format='json(labels)' | jq -r .labels.gh_ready)
if [[ $GH_READY == 1 ]]; then
elapsed=$(($i * $interval))
echo "✅ ${VM_ID} ready after ${elapsed} seconds"
break
fi
echo "${VM_ID} not ready yet, waiting 5 secs ..."
sleep 5
echo "${VM_ID} not ready yet, waiting $interval secs ..."
sleep $interval
done
if [[ $GH_READY == 1 ]]; then
echo "✅ ${VM_ID} ready ..."
else
echo "Waited 5 minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..."
if [[ $GH_READY != 1 ]]; then
echo "Waited $minutes minutes for ${VM_ID}, without luck, deleting ${VM_ID} ..."
gcloud --quiet compute instances delete ${VM_ID} --zone=${machine_zone}
exit 1
fi
Expand Down