diff --git a/.github/.yamllint.yaml b/.github/.yamllint.yaml
index 6fcf94af..1e520243 100644
--- a/.github/.yamllint.yaml
+++ b/.github/.yamllint.yaml
@@ -12,3 +12,4 @@ rules:
   comments: disable
   trailing-spaces: disable
   empty-lines: disable
+  new-line-at-end-of-file: disable
diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
index d6351eaa..49d4ef49 100644
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -21,3 +21,8 @@ paths:
    - '.*was deprecated.*'
    - '.*shellcheck.*:warning:.*'
    - '.*shellcheck.*:info:.*'
+
+   # The security warning that head.ref is dangerous is painfully stupid.
+   # It's worried that the branch name string could be malicious. (Never mind that
+   # an attacker generating PRs can much more easily just execute malicious code.)
+   - '.*github.event.pull_request.head.ref.*is potentially untrusted.*'
diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
index fe9f3496..5df9477f 100644
--- a/.github/workflows/pipeline.yaml
+++ b/.github/workflows/pipeline.yaml
@@ -252,6 +252,122 @@ jobs:
         if: always()
         run: docker stop ${{ steps.start_container.outputs.container_id }}
 
+  G4-end-to-end:
+    # Note this job can run multiple times in parallel because the stack name is unique
+    # for the run. How much of that parallelism we actually want is TBD.
+    runs-on: self-hosted
+
+    # Run this on any PR.
+    # Question: Should we wait until the other tests pass before running this?
+    #needs:
+    #  - validate-setup-ee
+    #  - test-with-k3s
+    #  - test-sdk
+
+    env:
+      PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_CICD_PAT }}
+      PYTHONUNBUFFERED: 1
+    defaults:
+      run:
+        working-directory: cicd/pulumi
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Name the stack
+        run: |
+          # Set to expire in 60 minutes
+          EXPIRATION_TIME=$(($(date +%s) + 60 * 60))
+          STACK_NAME=ee-cicd-${{ github.run_id }}-expires-${EXPIRATION_TIME}
+          echo "STACK_NAME=${STACK_NAME}" | tee -a $GITHUB_ENV
+          # We give the stack a name including its expiration time so that the sweeper
+          # (in sweeper-eeut.yaml) knows when to get rid of it.
+          # This saves us having to clean up here, which can be quite slow (~7 minutes for a g4).
+
+      - name: Check that AWS credentials are set
+        # Credentials come from an IAM profile on the runner instance
+        run: |
+          aws sts get-caller-identity
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install Pulumi
+        run: |
+          curl -fsSL https://get.pulumi.com | sh
+          export HOME=$(eval echo ~$(whoami))
+          echo "$HOME/.pulumi/bin" >> $GITHUB_PATH
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Make sure uv is working
+        run: |
+          uv --version
+          uv sync
+          uv run python --version
+
+      - name: Check that pulumi is installed and authenticated
+        run: |
+          uv run pulumi whoami
+
+      - name: Prepare pulumi stack
+        run: |
+          uv run pulumi stack init ${STACK_NAME}
+          uv run pulumi config
+
+      - name: Pick which commit we will test
+        run: |
+          echo "This is a bit subtle."
+          echo "We can't just test on 'main' for fairly obvious reasons - we"
+          echo "want to test the code in this PR's branch. The current commit"
+          echo "right here is ${GITHUB_SHA}, which is likely a merge commit."
+          echo "Merge commits are challenging. They are what would happen if"
+          echo "this PR were to be merged into its base branch. But they are"
+          echo "ephemeral things and not available in the public repo. So the"
+          echo "EEUT can't just check them out. Making them available to the"
+          echo "EEUT would require pushing them and polluting the repo. So,"
+          echo "for now, we are going to use the PR's head ref"
+          echo "${{ github.event.pull_request.head.ref }}, which is the branch"
+          echo "that was used to create the PR. Recognizing that this doesn't"
+          echo "reflect what will happen after merge. But it's simpler."
+
+          # TODO: test on the merge commit by pushing it to the repo with a temporary
+          # branch, and then clean up the branch later.
+
+          COMMIT_TO_TEST=${{ github.event.pull_request.head.ref }}
+          uv run pulumi config set ee-cicd:targetCommit ${COMMIT_TO_TEST}
+
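+      # Note: "pulumi up" below reads ee-cicd:targetCommit and bakes it into the EEUT's
+      # user-data script (see cicd/pulumi/__main__.py), so the new instance checks out
+      # exactly the commit chosen above when it boots.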
+      - name: Create the EEUT instance
+        run: |
+          uv run pulumi up --yes
+
+      - name: Check that EE install succeeded
+        run: |
+          uv run fab connect --patience=150
+          uv run fab wait-for-ee-setup
+
+      - name: Wait for K8 to load everything
+        run: |
+          uv run fab check-k8-deployments
+          uv run fab check-server-port
+
+      - name: Use groundlight sdk through EE
+        run: |
+          EEUT_IP=$(uv run pulumi stack output eeut_private_ip)
+          export GROUNDLIGHT_ENDPOINT=http://${EEUT_IP}:30101
+          uv run groundlight whoami
+          uv run groundlight list-detectors
+
+      - name: Thank the worker and shut down
+        if: always()
+        run: |
+          echo "Strong work, G4! Now go to sleep. The grim sweeper will visit soon."
+          # This saves money and frees up resources
+          uv run fab shutdown-instance
+
   build-push-edge-endpoint-multiplatform:
     if: ${{ github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' }}
     # We only run this action if all the prior test actions succeed
@@ -259,6 +375,7 @@ jobs:
       - test-general-edge-endpoint
       - test-sdk
       - validate-setup-ee
+      - G4-end-to-end
     runs-on: ubuntu-22.04
     steps:
       - name: Configure AWS credentials
diff --git a/.github/workflows/sweeper-eeut.yaml b/.github/workflows/sweeper-eeut.yaml
new file mode 100644
index 00000000..2a2f309e
--- /dev/null
+++ b/.github/workflows/sweeper-eeut.yaml
@@ -0,0 +1,61 @@
+name: sweeper-eeut
+# This workflow tears down old EEUT stacks from pulumi.
+# We do this as a background sweeper job, because the teardown is VERY slow (~7 minutes for a g4)
+# and we don't want to slow down the main pipeline for that.
+on:
+  schedule:
+    - cron: '*/15 * * * *'  # Every 15 minutes
+    # Note cron workflows only run from the main branch.
+  push:
+    branches:
+      # If you're working on this stuff, name your branch e2e-something and this will run.
+      - e2e*
+concurrency:
+  group: sweeper-eeut
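+  # Only one sweep runs at a time, so overlapping runs don't fight over the same stacks.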
+env:
+  PYTHON_VERSION: "3.11"
+
+jobs:
+  destroy-expired-eeut-stacks:
+    #runs-on: ubuntu-22.04  # preferably
+    # Currently running on self-hosted because something is wrong with the AWS perms on the GH runners.
+    runs-on: self-hosted
+    env:
+      PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_CICD_PAT }}
+    defaults:
+      run:
+        working-directory: cicd/pulumi
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Set AWS credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-west-2
+          # TODO: move these back to GH-provided secrets.
+          # Currently using IAM roles on the self-hosted runner instance.
+          #aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          #aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          #aws-session-token: ${{ secrets.AWS_SESSION_TOKEN }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install Pulumi
+        run: |
+          curl -fsSL https://get.pulumi.com | sh
+          export HOME=$(eval echo ~$(whoami))
+          echo "$HOME/.pulumi/bin" >> $GITHUB_PATH
+
+      - name: Check that pulumi is installed and authenticated
+        run: |
+          set -ex
+          pulumi whoami
+
+      - name: Destroy old EEUT stacks
+        working-directory: cicd/pulumi
+        run: |
+          ./sweep-destroy-eeut-stacks.sh
diff --git a/cicd/bin/install-on-ubuntu.sh b/cicd/bin/install-on-ubuntu.sh
new file mode 100755
index 00000000..9dd2aef6
--- /dev/null
+++ b/cicd/bin/install-on-ubuntu.sh
@@ -0,0 +1,102 @@
+#! /bin/bash
+# This script is intended to run on a new Ubuntu instance to set it up
+# as an edge-endpoint environment.
+# It is tested in the CICD pipeline by installing the edge-endpoint on a new
+# g4dn.xlarge EC2 instance with Ubuntu 22.04 LTS.
+
+# As a user-data script on ubuntu, this file probably lands at
+# /var/lib/cloud/instance/user-data.txt
+echo "Setting up Groundlight Edge Endpoint. Follow along at /var/log/cloud-init-output.log" > /etc/motd
+
+echo "Starting cloud init. Uptime: $(uptime)"
+
+# Set up signals about the status of the installation
+mkdir -p /opt/groundlight/ee-install-status
+touch /opt/groundlight/ee-install-status/installing
+SETUP_COMPLETE=0
+record_result() {
+    if [ "$SETUP_COMPLETE" -eq 0 ]; then
+        echo "Setup failed at $(date)"
+        touch /opt/groundlight/ee-install-status/failed
+        echo "Groundlight Edge Endpoint setup FAILED. See /var/log/cloud-init-output.log for details." > /etc/motd
+    else
+        echo "Setup complete at $(date)"
+        echo "Groundlight Edge Endpoint setup complete. See /var/log/cloud-init-output.log for details." > /etc/motd
+        touch /opt/groundlight/ee-install-status/success
+    fi
+    # Remove "installing" at the end to avoid a race where there is no status
+    rm -f /opt/groundlight/ee-install-status/installing
+}
+trap record_result EXIT
+
+set -e  # Exit on error of any command.
+
+wait_for_apt_lock() {
+    # We wait for any apt or dpkg processes to finish to avoid lock collisions.
+    # Unattended-upgrades can hold the lock and cause the install to fail.
+    while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do
+        echo "Another apt/dpkg process is running. Waiting for it to finish..."
+        sleep 5
+    done
+}
+
+# Install basic tools
+wait_for_apt_lock
+sudo apt update
+wait_for_apt_lock
+sudo apt install -y \
+    git \
+    vim \
+    tmux \
+    htop \
+    curl \
+    wget \
+    tree \
+    bash-completion \
+    ffmpeg
+
+# Download the edge-endpoint code
+CODE_BASE=/opt/groundlight/src/
+mkdir -p ${CODE_BASE}
+cd ${CODE_BASE}
+git clone https://github.com/groundlight/edge-endpoint
+cd edge-endpoint/
+# The launching script should update this to a specific commit.
+SPECIFIC_COMMIT="__EE_COMMIT_HASH__"
+if [ -n "$SPECIFIC_COMMIT" ]; then
+    # See if the string got substituted. Note we can't compare against the whole
+    # placeholder, because that would get substituted too! So compare the first 11
+    # characters, the length of "__EE_COMMIT".
+    if [ "${SPECIFIC_COMMIT:0:11}" != "__EE_COMMIT" ]; then
+        echo "Checking out commit ${SPECIFIC_COMMIT}"
+        git checkout ${SPECIFIC_COMMIT}
+    else
+        echo "It appears the commit hash was not substituted. Staying on main."
+    fi
+else
+    echo "A blank commit hash was provided. Staying on main."
+fi
+
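+# Note: everything below runs out of the checkout we just made, so the EEUT
+# exercises the deploy scripts of the commit under test rather than main.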
+# Set up k3s with GPU support
+./deploy/bin/install-k3s-nvidia.sh
+
+# Set up some shell niceties
+TARGET_USER="ubuntu"
+echo "alias k='kubectl'" >> /home/${TARGET_USER}/.bashrc
+echo "source <(kubectl completion bash)" >> /home/${TARGET_USER}/.bashrc
+echo "complete -F __start_kubectl k" >> /home/${TARGET_USER}/.bashrc
+echo "set -o vi" >> /home/${TARGET_USER}/.bashrc
+
+# Configure the edge-endpoint with environment variables
+export DEPLOYMENT_NAMESPACE="gl-edge"
+export INFERENCE_FLAVOR="GPU"
+export GROUNDLIGHT_API_TOKEN="api_token_not_set"
+
+# Install the edge-endpoint
+kubectl create namespace gl-edge
+kubectl config set-context edge --namespace=gl-edge --cluster=default --user=default
+kubectl config use-context edge
+./deploy/bin/setup-ee.sh
+
+# Indicate that setup is complete
+SETUP_COMPLETE=1
+echo "EE is installed into kubernetes, which will attempt to finish the setup."
\ No newline at end of file
diff --git a/cicd/pulumi/.envrc b/cicd/pulumi/.envrc
new file mode 100644
index 00000000..2c4be221
--- /dev/null
+++ b/cicd/pulumi/.envrc
@@ -0,0 +1,3 @@
+echo "This is a uv project. Remember to 'uv run ...' everything"
+uv sync
+
diff --git a/cicd/pulumi/.gitignore b/cicd/pulumi/.gitignore
new file mode 100644
index 00000000..16286e3f
--- /dev/null
+++ b/cicd/pulumi/.gitignore
@@ -0,0 +1,5 @@
+
+*.pyc
+venv/
+.venv/
+__pycache__/
diff --git a/cicd/pulumi/Pulumi.yaml b/cicd/pulumi/Pulumi.yaml
new file mode 100644
index 00000000..e1637124
--- /dev/null
+++ b/cicd/pulumi/Pulumi.yaml
@@ -0,0 +1,11 @@
+name: ee-cicd
+runtime:
+  name: python
+  options:
+    toolchain: uv
+description: CI/CD for Edge Endpoint
+config:
+  ee-cicd:instanceType: g4dn.xlarge
+  # Default to "main" so things are sensible if this doesn't get customized.
+  # But for testing purposes, this should be set to the specific commit you want to test.
+  ee-cicd:targetCommit: main
diff --git a/cicd/pulumi/README.md b/cicd/pulumi/README.md
new file mode 100644
index 00000000..da4ffab5
--- /dev/null
+++ b/cicd/pulumi/README.md
@@ -0,0 +1,5 @@
+# Pulumi automation
+
+Pulumi automation to build an EE from scratch in EC2 and run basic integration tests.
+
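+## Quick start
+
+A rough manual run might look like this (assumes AWS credentials and a
+Pulumi access token are already configured in your shell):
+
+```bash
+uv sync
+uv run pulumi stack init my-test-stack
+uv run pulumi config set ee-cicd:targetCommit main
+uv run pulumi up --yes
+uv run fab full-check          # connect, wait for the install, check k8s and the service port
+uv run fab shutdown-instance   # stop the EEUT when you're done
+```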
+ """ + iam_client = boto3.client("iam") + paginator = iam_client.get_paginator("list_instance_profiles") + + for page in paginator.paginate(): + for profile in page["InstanceProfiles"]: + # Check if the profile has the desired tag + tags = iam_client.list_instance_profile_tags(InstanceProfileName=profile["InstanceProfileName"]) + for tag in tags["Tags"]: + if tag["Key"] == tag_key and tag["Value"] == tag_value: + return profile["InstanceProfileName"] + raise ValueError(f"No instance profile found with tag {tag_key}: {tag_value}") + +def get_target_commit() -> str: + """Gets the target commit hash.""" + target_commit = config.require("targetCommit") + if target_commit == "main": + target_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() + print(f"Using target commit {target_commit}") + return target_commit + +def load_user_data_script() -> str: + """Loads and customizes the user data script for the instance, which is used to install + everything on the instance.""" + with open('../bin/install-on-ubuntu.sh', 'r') as file: + user_data_script = file.read() + target_commit = get_target_commit() + user_data_script = user_data_script.replace("__EE_COMMIT_HASH__", target_commit) + return user_data_script + +instance_profile_name = get_instance_profile_by_tag("Name", "edge-device-instance-profile") + +eeut_instance = aws.ec2.Instance("ee-cicd-instance", + instance_type=instance_type, + ami="ami-0d2047d61ff42e139", # Deep Learning OSS Nvidia Driver AMI GPU PyTorch 2.5 (Ubuntu 22.04) x86/64 + key_name="ghar2eeut", + vpc_security_group_ids=[eeut_sg.id], + subnet_id=subnet.id, + user_data=load_user_data_script(), + associate_public_ip_address=True, + iam_instance_profile=instance_profile_name, + root_block_device={ + "volume_size": 100, + "volume_type": "gp3", + }, + tags={ + "Name": f"eeut-{stackname}", + }, +) + +pulumi.export("eeut_instance_id", eeut_instance.id) +pulumi.export("eeut_private_ip", eeut_instance.private_ip) diff --git a/cicd/pulumi/fabfile.py b/cicd/pulumi/fabfile.py new file mode 100644 index 00000000..e560755a --- /dev/null +++ b/cicd/pulumi/fabfile.py @@ -0,0 +1,226 @@ +""" +Fabric tools to connect to the EEUT and see how it's doing. 
+""" +from functools import lru_cache +from typing import Callable +import os +import time +import io + +from fabric import task, Connection, Config +from invoke import run as local +import boto3 +import paramiko + +def fetch_secret(secret_id: str) -> str: + """Fetches a secret from AWS Secrets Manager.""" + client = boto3.client("secretsmanager", region_name="us-west-2") + response = client.get_secret_value(SecretId=secret_id) + return response['SecretString'] + +def get_eeut_ip() -> str: + """Gets the EEUT's IP address from Pulumi.""" + return local("pulumi stack output eeut_private_ip", hide=True).stdout.strip() + +def connect_server() -> Connection: + """Connects to the EEUT, using the private key stored in AWS Secrets Manager.""" + ip = get_eeut_ip() + try: + private_key = fetch_secret("ghar2eeut-private-key") + private_key_file = io.StringIO(private_key) + key = paramiko.Ed25519Key.from_private_key(private_key_file) + conn = Connection( + ip, + user='ubuntu', + connect_kwargs={"pkey": key}, + ) + conn.run(f"echo 'Successfully logged in to {ip}'") + return conn + except paramiko.ssh_exception.SSHException as e: + print(f"Failed to connect to {ip}") + raise + + +class InfrequentUpdater: + """Displays messages as they happen, but don't repeat the same message too often.""" + + def __init__(self, how_often: float = 30): + self.how_often = how_often + self.last_update = 0 + self.last_msg = "" + + def maybe_update(self, msg: str): + """Displays a message if it's been long enough since the last message, and the same. + New messages are always displayed.""" + if msg == self.last_msg: + if time.time() - self.last_update < self.how_often: + return + print(msg) + self.last_msg = msg + self.last_update = time.time() + +@task +def connect(c, patience: int = 30): + """Just connect to a server to validate connection is working. + + Args: + patience (int): Number of seconds to keep retrying for. + """ + print("Fab/fabric is working. Connecting to server...") + updater = InfrequentUpdater() + start_time = time.time() + while time.time() - start_time < patience: + try: + connect_server() + print("Successfully connected to server.") + return + except Exception as e: + updater.maybe_update(f"Failed to connect to server: {e}") + time.sleep(3) + raise RuntimeError(f"Failed to connect to server after {patience} seconds.") + + +class StatusFileChecker(InfrequentUpdater): + """Encapsulates all the logic for checking status files.""" + + def __init__(self, conn: Connection, path: str): + super().__init__() + self.conn = conn + self.path = path + self.last_update = 0 + self.last_msg = "" + + def check_for_file(self, name: str) -> bool: + """Checks if a file is present in the EEUT's install status directory.""" + with self.conn.cd(self.path): + result = self.conn.run(f"test -f {name}", warn=True) + return result.ok + + def which_status_file(self) -> str: + """Returns the name of the status file if it exists, or None if it doesn't.""" + with self.conn.cd(self.path): + if self.check_for_file("installing"): + return "installing" + if self.check_for_file("success"): + return "success" + if self.check_for_file("failed"): + return "failed" + return None + + def wait_for_any_status(self, wait_minutes: int = 10) -> str: + """Waits for the EEUT to begin setup. This is a brand new sleepy server + rubbing its eyes and waking up. Give it a bit to start doing something. 
+ """ + start_time = time.time() + while time.time() - start_time < 60 * wait_minutes: + try: + status_file = self.which_status_file() + self.maybe_update(f"Found status file: {status_file}") + if status_file: + return status_file + except Exception as e: + self.maybe_update(f"Unable to check status file: {e}") + time.sleep(2) + raise RuntimeError(f"No status file found after {wait_minutes} minutes.") + + def wait_for_success(self, wait_minutes: int = 10) -> bool: + """Waits for the EEUT to finish setup. If it fails, prints the log.""" + start_time = time.time() + while time.time() - start_time < 60 * wait_minutes: + if self.check_for_file("success"): + return True + if self.check_for_file("failed"): + print("EE installation failed. Printing complete log...") + self.conn.run("cat /var/log/cloud-init-output.log") + raise RuntimeError("EE installation failed.") + self.maybe_update(f"Waiting for success or failed status file to appear...") + time.sleep(2) + raise RuntimeError(f"EE installation check timed out after {wait_minutes} minutes.") + +@task +def wait_for_ee_setup(c, wait_minutes: int = 10): + """Waits for the EEUT to finish setup. If it fails, prints the log.""" + conn = connect_server() + checker = StatusFileChecker(conn, "/opt/groundlight/ee-install-status") + print("Waiting for any status file to appear...") + checker.wait_for_any_status(wait_minutes=wait_minutes/2) + print("Waiting for success status file to appear...") + checker.wait_for_success(wait_minutes=wait_minutes) + print("EE installation complete.") + + +def wait_for_condition(conn: Connection, condition: Callable[[Connection], bool], wait_minutes: int = 10) -> bool: + """Waits for a condition to be true. Returns True if the condition is true, False otherwise.""" + updater = InfrequentUpdater() + start_time = time.time() + name = condition.__name__ + while time.time() - start_time < 60 * wait_minutes: + try: + if condition(conn): + print(f"Condition {name} is true. Moving on.") + return True + else: + updater.maybe_update(f"Condition {name} is false. Still waiting...") + except Exception as e: + updater.maybe_update(f"Condition {name} failed: {e}. Will retry...") + time.sleep(2) + print(f"Condition {name} timed out after {wait_minutes} minutes.") + return False + +@task +def check_k8_deployments(c): + """Checks that the edge-endpoint deployment goes online. 
+ """ + conn = connect_server() + def can_run_kubectl(conn: Connection) -> bool: + conn.run("kubectl get pods") # If this works at all, we're happy + return True + if not wait_for_condition(conn, can_run_kubectl): + raise RuntimeError("Failed to run kubectl.") + def see_deployments(conn: Connection) -> bool: + out = conn.run("kubectl get deployments", hide=True) + # Need to see the edge-endpoint deployment + return "edge-endpoint" in out.stdout + if not wait_for_condition(conn, see_deployments): + conn.run("kubectl get all -A", hide=True) + raise RuntimeError("Failed to see edge-endpoint deployment.") + def edge_endpoint_ready(conn: Connection) -> bool: + out = conn.run("kubectl get deployments edge-endpoint", hide=True) + return "1/1" in out.stdout + if not wait_for_condition(conn, edge_endpoint_ready): + conn.run("kubectl get deployments edge-endpoint -o yaml") + conn.run("kubectl describe deployments edge-endpoint") + conn.run("kubectl logs deployment/edge-endpoint") + raise RuntimeError("Failed to see edge-endpoint deployment ready.") + +@task +def check_server_port(c): + """Checks that the server is listening on the service port.""" + # First check that it's visible from the EEUT's localhost + conn = connect_server() + print(f"Checking that the server is listening on port 30101 from the EEUT's localhost...") + conn.run("nc -zv localhost 30101") + + print(f"Checking that the server is reachable from here...") + eeut_ip = get_eeut_ip() + local(f"nc -zv {eeut_ip} 30101") + + print("Server port check complete.") + + +@task +def full_check(c): + """Runs all the checks in order.""" + connect(c) + wait_for_ee_setup(c) + check_k8_deployments(c) + check_server_port(c) + + +@task +def shutdown_instance(c): + """Shuts down the EEUT instance.""" + conn = connect_server() + # Tell it to shutdown in 2 minutes, so it doesn't die while we're still connected. + conn.run("sudo shutdown +2") + print("Instance will shutdown in 2 minutes. Disconnecting...") \ No newline at end of file diff --git a/cicd/pulumi/pyproject.toml b/cicd/pulumi/pyproject.toml new file mode 100644 index 00000000..2a7cb30e --- /dev/null +++ b/cicd/pulumi/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "pulumi-ee-cicd" +version = "0.1.0" +description = "Pulumi code for Edge Endpoint CI/CD" +requires-python = ">=3.10" +dependencies = [ + "pulumi>=3", + "pulumi-aws>=6.13.3", + "boto3>=1.36.1", + "groundlight>=0.21.3", + "fabric>=3.2.2", +] diff --git a/cicd/pulumi/ssh-to-eeut.sh b/cicd/pulumi/ssh-to-eeut.sh new file mode 100755 index 00000000..3986786c --- /dev/null +++ b/cicd/pulumi/ssh-to-eeut.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# This is an odd script. It will only work from the GHA runner. +# and it expects to have access to the pulumi stack here. Which means +# to use it you'd have to log into the runner and clone the EE repo. +# But if that's what you need to do, this will help. + +# Alternately, you can use this from a workstation that has pulumi access, +# but not network access, and define the proxy host in the EEUT_PROXY_HOST +# variable. Or you might set the EEUT_PROXY_HOST with an IP address and an SSH key like +# export EEUT_PROXY_HOST="1.2.3.4 -i ~/.ssh/runner-admin.pem" + +set -x + +if [ ! 
+if [ ! -f ~/.ssh/ghar2eeut.pem ]; then
+    aws secretsmanager get-secret-value --secret-id "ghar2eeut-private-key" | jq .SecretString -r > ~/.ssh/ghar2eeut.pem
+    chmod 600 ~/.ssh/ghar2eeut.pem
+fi
+
+EEUT_IP=$(pulumi stack output eeut_private_ip)
+
+if [ -n "$EEUT_PROXY_HOST" ]; then
+    PROXY_COMMAND=(-o ProxyCommand="ssh -W %h:%p ubuntu@$EEUT_PROXY_HOST")
+else
+    PROXY_COMMAND=()
+fi
+
+ssh -i ~/.ssh/ghar2eeut.pem "${PROXY_COMMAND[@]}" ubuntu@$EEUT_IP
+
diff --git a/cicd/pulumi/sweep-destroy-eeut-stacks.sh b/cicd/pulumi/sweep-destroy-eeut-stacks.sh
new file mode 100755
index 00000000..ba9931de
--- /dev/null
+++ b/cicd/pulumi/sweep-destroy-eeut-stacks.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# This script is run by the sweeper-eeut.yaml GitHub Actions workflow.
+# It destroys all EEUT stacks that have expired.
+
+set -e
+
+destroy_stack() {
+    # We don't need to make this super robust (retrying a lot) because if it fails,
+    # we'll try again on the next cron run.
+    STACK_NAME=$1
+    pulumi stack select $STACK_NAME
+    INSTANCE_ID=$(pulumi stack output eeut_instance_id 2>/dev/null || echo "")
+    if [ -n "$INSTANCE_ID" ]; then
+        # Note pulumi is too stupid to terminate an instance that is in the stopped state,
+        # so we check for this manually.
+        INSTANCE_STATE=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query 'Reservations[*].Instances[*].State.Name' --output text)
+        if [ "$INSTANCE_STATE" == "stopped" ]; then
+            echo "Instance $INSTANCE_ID is stopped. Terminating..."
+            aws ec2 terminate-instances --instance-ids $INSTANCE_ID || echo "Failed to terminate instance $INSTANCE_ID"
+        fi
+    fi
+    pulumi destroy --yes || echo "Failed to destroy stack $STACK_NAME"
+    pulumi stack rm $STACK_NAME --yes || echo "Failed to remove stack $STACK_NAME"
+    echo -e "Stack $STACK_NAME destroyed\n\n"
+}
+
+# Stack output JSON looks like:
+#[
+#  {
+#    # the pipeline YAML puts an expiration time (epoch seconds) in the stack name
+#    "name": "ee-cicd-1234-expires-1737243595",
+#    "current": true,
+#    "lastUpdate": "2025-01-18T00:58:02.000Z",
+#    "updateInProgress": false,
+#    "resourceCount": 0,
+#    "url": "https://app.pulumi.com/something/ee-cicd/tmpdel"
+#  }, {...}
+#]
+
+STACKS_JSON=$(pulumi stack ls --json)
+NUM_STACKS=$(echo "$STACKS_JSON" | jq -r '. | length')
+echo "Found $NUM_STACKS total stacks"
+
+for ((i=0; i