From 3e43f8078dffc51378615d68e330900e44edf36f Mon Sep 17 00:00:00 2001 From: Shuyang Xin Date: Fri, 27 Dec 2024 00:28:16 +0800 Subject: [PATCH] Add Jenkins job for scaling agents * The script facilitates scaling of Jenkins agents by allowing both adding agents and removing agents. It integrates with GCP and the Jenkins API to add or remove agents dynamically based on the queue load, improving testbed efficiency. * The `add_agent` function scales up the Jenkins agents by checking if jobs with the specified label are present in the Jenkins queue. If jobs are found, it creates a new GCP VM, waits for the external IP to become available, and registers the new agent with Jenkins. * The `remove_agent` function scales down the Jenkins agents by identifying idle agents. It removes the agent from Jenkins and deletes the associated VM, ensuring that only idle agents are removed, preventing any disruption to ongoing jobs. Signed-off-by: Shuyang Xin --- ci/jenkins/jobs/projects-cloud.yaml | 36 ++++ ci/jenkins/scale-agent.sh | 285 ++++++++++++++++++++++++++++ 2 files changed, 321 insertions(+) create mode 100644 ci/jenkins/scale-agent.sh diff --git a/ci/jenkins/jobs/projects-cloud.yaml b/ci/jenkins/jobs/projects-cloud.yaml index c54b7f49469..b12194c4b33 100644 --- a/ci/jenkins/jobs/projects-cloud.yaml +++ b/ci/jenkins/jobs/projects-cloud.yaml @@ -861,6 +861,42 @@ recipients: projectantrea-dev@googlegroups.com rahulj@vmware.com triggers: [] wrappers: [] + - 'cloud-{name}-{test_name}': + test_name: auto-scaling-agent + node: antrea-cloud + description: 'This is cloud job of auto scaling agents for antrea.' 
+ branches: + - '${{ANTREA_GIT_REVISION}}' + repo_url: '${{ANTREA_REPO}}' + builders: + - shell: |- + #!/bin/bash + # Prevent exposing credentials in the console output by adding set +x + # This is to avoid other developers removing this set+x by accident + set +x + sudo gcloud auth login --cred-file=${CREDENTIAL_PATH} + sudo gcloud config set project antrea + sudo ./ci/jenkins/scale-agent.sh --kind --jenkins-user ${{JENKINS_USER}} --jenkins-token ${{JENKINS_TOKEN}} + triggers: + - timed: */15 * * * * + publishers: + - archive: + allow-empty: true + case-sensitive: true + default-excludes: true + fingerprint: false + only-if-success: false + - email: + notify-every-unstable-build: true + recipients: projectantrea-dev@googlegroups.com + wrappers: + - credentials-binding: + - text: + credential-id: JENKINS_USER + variable: JENKINS_USER + - text: + credential-id: JENKINS_TOKEN + variable: JENKINS_TOKEN - '{name}-{test_name}-matrix-compatibility-test': test_name: weekly node: 'antrea-test-node' diff --git a/ci/jenkins/scale-agent.sh b/ci/jenkins/scale-agent.sh new file mode 100644 index 00000000000..4fd5d41eb5e --- /dev/null +++ b/ci/jenkins/scale-agent.sh @@ -0,0 +1,285 @@ +#!/usr/bin/env bash + +# Copyright 2024 Antrea Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
set -euo pipefail

# Print all arguments to stderr.
function echoerr {
    >&2 echo "$@"
}

# ---------------------------------------------------------------------------
# Defaults. Values with a matching command-line flag can be overridden below.
# ---------------------------------------------------------------------------
WORKDIR="/var/lib/jenkins"
RUN_SETUP_ONLY=false
RUN_CLEANUP_ONLY=false
# --kind is accepted on the command line, but nothing in this script reads it.
# shellcheck disable=SC2034
KIND=false
JENKINS_URL="https://jenkins.antrea.io"
# Credentials may come from the environment or from the command line.
# JENKINS_API_TOKEN is still honoured because earlier revisions of this script
# read the token from that variable.
JENKINS_USER="${JENKINS_USER:-}"
JENKINS_TOKEN="${JENKINS_TOKEN:-${JENKINS_API_TOKEN:-}}"
LABEL="kind"
ZONE="us-west1-a"
MACHINE_TYPE="e2-standard-4"
IMAGE_FAMILY="ubuntu-2204-lts"
IMAGE_PROJECT="ubuntu-os-cloud"
BOOT_DISK_SIZE="200GB"
AGENT_NAME_PATTERN="jenkins-agent"
# The epoch suffix keeps VM / Jenkins node names unique across runs.
NEW_AGENT_NAME="$AGENT_NAME_PATTERN-$(date +%s)"

_usage="Usage: $0 [--workdir <HomePath>] [--setup-only] [--cleanup-only] [--jenkins-user <User>] [--jenkins-token <Token>] [--label <Label>] [--kind]

Scale Jenkins agents up or down to run CI tests.

        --workdir             Remote root directory configured for the new Jenkins agent. Default is $WORKDIR.
        --setup-only          Only try to add an agent (scale up).
        --cleanup-only        Only try to remove an idle agent (scale down).
        --jenkins-user        Jenkins user name.
        --jenkins-token       Jenkins API token.
        --label               Label for the jenkins agent. Default is $LABEL.
        --kind                Setup kind testbed."

function print_usage {
    echoerr "$_usage"
}

function print_help {
    echoerr "Try '$0 --help' for more information."
}

# Abort with a clear message when an option that needs a value is the last
# argument; a bare 'shift 2' would otherwise fail silently under 'set -e'.
#   $1 - option name, $2 - number of arguments left (including the option)
function require_value {
    if [[ "$2" -lt 2 ]]; then
        echoerr "Option $1 requires a value"
        print_help
        exit 1
    fi
}

while [[ $# -gt 0 ]]; do
    key="$1"

    case "$key" in
        --workdir)
            require_value "$key" "$#"
            WORKDIR="$2"
            shift 2
            ;;
        --setup-only)
            RUN_SETUP_ONLY=true
            shift
            ;;
        --cleanup-only)
            RUN_CLEANUP_ONLY=true
            shift
            ;;
        --kind)
            KIND=true
            shift
            ;;
        --jenkins-user)
            require_value "$key" "$#"
            JENKINS_USER="$2"
            shift 2
            ;;
        --jenkins-token)
            require_value "$key" "$#"
            JENKINS_TOKEN="$2"
            shift 2
            ;;
        --label)
            require_value "$key" "$#"
            LABEL="$2"
            shift 2
            ;;
        -h|--help)
            print_usage
            exit 0
            ;;
        *)  # unknown option
            echoerr "Unknown option $1"
            print_help
            exit 1
            ;;
    esac
done

# Run curl against Jenkins with the configured credentials.
# Extra curl options and the URL are passed through unchanged.
# NOTE(review): -k (skip TLS verification) is kept from the original queue and
# node-creation calls and now applies to every call; drop it once the server
# certificate is known to validate.
function jenkins_curl {
    curl -k -s -u "$JENKINS_USER:$JENKINS_TOKEN" "$@"
}

# Check whether a job requesting the given label stays in the Jenkins queue.
# The queue is sampled twice, 10 seconds apart, so that a job which is picked
# up immediately does not trigger a scale-up.
# Arguments: $1 - label to look for
# Outputs:   "1" on stdout if a matching job was seen in both samples,
#            "0" otherwise. All log messages go to stderr so that callers can
#            capture stdout safely.
# Returns:   0 normally, 1 if the queue could not be fetched or parsed.
function check_jobs_in_queue {
    local label=$1
    local found=0
    local attempt queue_json queued_labels

    for attempt in 1 2; do
        if ! queue_json=$(jenkins_curl -f "$JENKINS_URL/queue/api/json"); then
            echoerr "Failed to query the Jenkins queue."
            return 1
        fi
        if ! queued_labels=$(jq -r '.items[] | select(.task.labels != null) | .task.labels[]' <<<"$queue_json"); then
            echoerr "Failed to parse the Jenkins queue."
            return 1
        fi

        if grep -qw -- "$label" <<<"$queued_labels"; then
            found=1
            if [[ "$attempt" -lt 2 ]]; then
                echoerr "Job with label $label found in queue. Retrying in 10 seconds..."
                sleep 10
            fi
        else
            echoerr "No job with label $label found in queue."
            found=0
            break
        fi
    done

    echo "$found"
}

# Poll GCP until the new VM reports an external (NAT) IP.
# Returns: 0 once an IP is reported, 1 after 18 attempts (about 3 minutes).
function wait_for_external_ip {
    local external_ip

    echo "Waiting for External IP of $NEW_AGENT_NAME to be available..."
    for _ in {1..18}; do
        # A failing 'describe' is treated like "not ready yet".
        external_ip=$(sudo gcloud compute instances describe "$NEW_AGENT_NAME" --zone="$ZONE" \
            --format='get(networkInterfaces[0].accessConfigs[0].natIP)') || external_ip=""

        if [[ -n "$external_ip" ]]; then
            echo "External IP $external_ip is now available."
            return 0
        fi
        echo "External IP not ready yet. Waiting 10 seconds..."
        sleep 10
    done

    echoerr "External IP for $NEW_AGENT_NAME did not become ready within 3 minutes. Exiting..."
    return 1
}

# Register $NEW_AGENT_NAME as a node in Jenkins.
# Returns: 0 if Jenkins answered with HTTP 200, 1 otherwise.
function register_agent {
    local node_json response_code

    # Only the three shell variables below are meant to be expanded; every
    # other '$' is part of a Java class name or JSON key and is escaped.
    node_json=$(cat <<EOF
{
    "name": "$NEW_AGENT_NAME",
    "nodeDescription": "$NEW_AGENT_NAME",
    "numExecutors": "5",
    "remoteFS": "$WORKDIR",
    "labelString": "$LABEL",
    "mode": "EXCLUSIVE",
    "": [
        "hudson.slaves.JNLPLauncher",
        "hudson.slaves.RetentionStrategy\$Always"
    ],
    "launcher": {
        "stapler-class": "hudson.slaves.JNLPLauncher",
        "\$class": "hudson.slaves.JNLPLauncher",
        "workDirSettings": {
            "disabled": true,
            "workDirPath": "",
            "internalDir": "remoting",
            "failIfWorkDirIsMissing": false
        },
        "tunnel": "",
        "vmargs": ""
    },
    "retentionStrategy": {
        "stapler-class": "hudson.slaves.RetentionStrategy\$Always",
        "\$class": "hudson.slaves.RetentionStrategy\$Always"
    },
    "nodeProperties": {
        "stapler-class-bag": "true",
        "hudson-slaves-EnvironmentVariablesNodeProperty": {
            "env": [
                {
                    "key": "JAVA_HOME",
                    "value": "/usr/lib/jvm/java-11-openjdk-amd64"
                }
            ]
        },
        "_comment:": {
            "hudson-tools-ToolLocationNodeProperty": {
                "locations": [
                    {
                        "key": "hudson.model.JDK\$DescriptorImpl@JAVA-11",
                        "home": "/usr/bin/java"
                    }
                ]
            }
        }
    }
}
EOF
)

    # --data-urlencode implies POST with a form content type and encodes the
    # JSON payload. -X POST is deliberately not used so that the redirect
    # followed by -L is fetched with GET. The body is discarded so that only
    # the status code is captured, and -v is omitted because it would print
    # the Authorization header to the console.
    response_code=$(jenkins_curl -L -o /dev/null -w "%{http_code}" \
        --data-urlencode "json=${node_json}" \
        "$JENKINS_URL/computer/doCreateItem?name=$NEW_AGENT_NAME&type=hudson.slaves.DumbSlave") || response_code="000"
    if [[ "$response_code" != "200" ]]; then
        echoerr "Failed to create agent. HTTP status code: $response_code"
        return 1
    fi
}

# Print the connection secret of $NEW_AGENT_NAME on stdout.
# The secret is taken from the first <argument> element of the agent's JNLP
# descriptor.
# Returns: 0 on success, 1 if the descriptor cannot be fetched or parsed.
function fetch_agent_secret {
    local jnlp
    local -r secret_re='<argument>([^<]+)</argument>'

    jnlp=$(jenkins_curl -f "$JENKINS_URL/computer/$NEW_AGENT_NAME/slave-agent.jnlp") || return 1
    [[ "$jnlp" =~ $secret_re ]] || return 1
    echo "${BASH_REMATCH[1]}"
}

# Download agent.jar on the new VM and start the agent in the background.
# Arguments: $1 - agent secret
# Returns:   0 once the remote command succeeded, 1 after 6 failed attempts.
# NOTE(review): assumes the VM image provides a 'java' binary - confirm.
function launch_agent {
    local secret=$1
    local remote_cmd

    # The agent is detached with nohup inside a subshell so that the SSH
    # session (and therefore this script) returns instead of blocking for the
    # agent's whole lifetime.
    remote_cmd="curl -ksO $JENKINS_URL/jnlpJars/agent.jar && (nohup java -jar agent.jar -url $JENKINS_URL -secret $secret -name $NEW_AGENT_NAME > agent.log 2>&1 < /dev/null &)"

    # sshd may not accept connections yet right after the VM got its IP.
    for _ in {1..6}; do
        if sudo gcloud compute ssh "$NEW_AGENT_NAME" --zone="$ZONE" --quiet --command "$remote_cmd"; then
            return 0
        fi
        echo "SSH to $NEW_AGENT_NAME not ready yet. Waiting 10 seconds..."
        sleep 10
    done
    return 1
}

# Best-effort removal of a half-created agent: the Jenkins node (if it was
# registered) and the VM, so that a failed scale-up does not leak resources.
function rollback_agent {
    echoerr "Rolling back partially created agent $NEW_AGENT_NAME."
    jenkins_curl -X POST "$JENKINS_URL/computer/$NEW_AGENT_NAME/doDelete" > /dev/null || true
    sudo gcloud compute instances delete "$NEW_AGENT_NAME" --zone="$ZONE" --quiet \
        || echoerr "Failed to delete VM instance $NEW_AGENT_NAME, please delete it manually."
}

# Scale up: if jobs requesting $LABEL are waiting in the Jenkins queue, create
# a GCP VM, register it as a Jenkins node and start the agent on it.
# Returns: 0 if an agent was added or there was nothing to do,
#          1 on any failure (the partially created agent is rolled back).
function add_agent {
    local job_exists secret

    echo "Checking if there are jobs in the Jenkins queue with label: $LABEL"
    if ! job_exists=$(check_jobs_in_queue "$LABEL"); then
        echoerr "Unable to inspect the Jenkins queue."
        return 1
    fi

    # An empty queue is the normal idle case, not an error.
    if [[ "$job_exists" != "1" ]]; then
        echo "Jobs with label $LABEL not found in the queue. Exit."
        return 0
    fi

    echo "Jobs with label $LABEL exist in the queue. Proceeding with agent creation..."
    if ! sudo gcloud compute instances create "$NEW_AGENT_NAME" --zone="$ZONE" \
        --machine-type="$MACHINE_TYPE" --image-family="$IMAGE_FAMILY" \
        --image-project="$IMAGE_PROJECT" --boot-disk-size="$BOOT_DISK_SIZE"; then
        echoerr "Failed to create VM instance $NEW_AGENT_NAME."
        return 1
    fi

    if ! wait_for_external_ip; then
        rollback_agent
        return 1
    fi

    if ! register_agent; then
        rollback_agent
        return 1
    fi

    if ! secret=$(fetch_agent_secret); then
        echoerr "Failed to retrieve the secret of agent $NEW_AGENT_NAME."
        rollback_agent
        return 1
    fi

    if ! launch_agent "$secret"; then
        echoerr "Failed to start the Jenkins agent on $NEW_AGENT_NAME."
        rollback_agent
        return 1
    fi

    echo "Agent $NEW_AGENT_NAME has been added."
}

# Print the number of busy executors of a Jenkins node on stdout.
# depth=1 is requested so that executor objects carry 'currentExecutable';
# one-off executors are counted as well.
# Arguments: $1 - node name
# Returns:   0 on success, 1 if the node cannot be queried or parsed.
function count_running_jobs {
    local agent=$1
    local info

    info=$(jenkins_curl -f "$JENKINS_URL/computer/$agent/api/json?depth=1") || return 1
    jq '[(.executors // [])[], (.oneOffExecutors // [])[] | select(.currentExecutable != null)] | length' <<<"$info"
}

# Scale down: remove at most one idle agent whose name contains
# $AGENT_NAME_PATTERN, first from Jenkins and then the backing GCP VM.
# Returns: 0 if an agent was removed or none could be removed,
#          1 if the agent list is unavailable or a deletion fails.
function remove_agent {
    local agent_list agent_name running_jobs
    local deleted_once=false
    local -a agents=()

    if ! agent_list=$(jenkins_curl -f "$JENKINS_URL/computer/api/json" | jq -r '.computer[] | .displayName'); then
        echoerr "Failed to list Jenkins agents."
        return 1
    fi
    if [[ -z "$agent_list" ]]; then
        echoerr "No agents found."
        return 1
    fi

    # One name per line; avoids word-splitting on names with spaces.
    mapfile -t agents <<<"$agent_list"

    for agent_name in "${agents[@]}"; do
        [[ "$agent_name" == *"$AGENT_NAME_PATTERN"* ]] || continue
        echo "Checking Agent: $agent_name"

        # Never treat an unreadable answer as "idle".
        if ! running_jobs=$(count_running_jobs "$agent_name") || [[ ! "$running_jobs" =~ ^[0-9]+$ ]]; then
            echoerr "Could not determine running jobs of agent $agent_name, skipping."
            continue
        fi
        if [[ "$running_jobs" -ne 0 ]]; then
            echo "Agent $agent_name is running $running_jobs jobs, cannot delete."
            continue
        fi

        echo "Agent $agent_name can be safely removed"

        # Best effort: stop new jobs from landing on the agent before deletion.
        # NOTE(review): confirm that this endpoint exists on the target Jenkins.
        echo "Removing label from agent $agent_name"
        jenkins_curl -X POST "$JENKINS_URL/computer/$agent_name/label" -d "labels=" > /dev/null \
            || echoerr "Could not remove label from agent $agent_name, continuing."

        # Recheck if there are no running jobs
        if ! running_jobs=$(count_running_jobs "$agent_name") || [[ ! "$running_jobs" =~ ^[0-9]+$ ]]; then
            echoerr "Could not determine running jobs of agent $agent_name, skipping."
            continue
        fi
        if [[ "$running_jobs" -ne 0 ]]; then
            echo "Agent $agent_name still has $running_jobs jobs running, cannot delete."
            continue
        fi

        # Remove agent by calling jenkins api
        if ! jenkins_curl -f -X POST "$JENKINS_URL/computer/$agent_name/doDelete" > /dev/null; then
            echoerr "Failed to remove agent $agent_name from Jenkins."
            return 1
        fi

        # Destroy the cloud VM
        if ! sudo gcloud compute instances delete "$agent_name" --zone="$ZONE" --quiet; then
            echoerr "Failed to delete VM instance $agent_name."
            return 1
        fi
        echo "Agent $agent_name has been deleted."
        deleted_once=true
        # Only one agent is removed per run.
        break
    done

    if [[ "$deleted_once" == false ]]; then
        echo "No agents can be removed."
    fi
}

if [[ -z "$JENKINS_USER" || -z "$JENKINS_TOKEN" ]]; then
    echoerr "Jenkins credentials are required: use --jenkins-user and --jenkins-token."
    print_help
    exit 1
fi

# Default run: first retire one idle agent, then add one if jobs are waiting.
# --setup-only skips the removal, --cleanup-only skips the addition.
if [[ "$RUN_SETUP_ONLY" != true ]]; then
    remove_agent
fi

if [[ "$RUN_CLEANUP_ONLY" != true ]]; then
    add_agent
fi