Skip to content

Commit

Permalink
.github: deployment automation for data-plane-controller and agent-api
Browse files Browse the repository at this point in the history
Introduce workflow_dispatch actions for building and deploying these
control-plane services to Cloud Run, using workload identity federation.

Include Dockerfile infrastructure and entrypoints for placing secrets
into expected locations.

Minor tweaks to data-plane-controller, adjusting defaults to work better
with Cloud Run and also to support IPv4 addresses for Ansible hosts,
which is required due to our dependency on Cloud Run (which only
supports IPv4, paired with Cloud NAT).
  • Loading branch information
jgraettinger committed Nov 15, 2024
1 parent e3f464a commit dfc5b27
Show file tree
Hide file tree
Showing 10 changed files with 268 additions and 9 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/deploy-agent-api.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Builds the `agent` binary and deploys it to the Cloud Run service `agent-api`,
# authenticating to GCP through workload identity federation.
name: Deploy agent-api

on:
  workflow_dispatch: {}
  # TODO(johnny): Remove after merging.
  push:
    branches: [johnny/dpc-cd]

env:
  CARGO_INCREMENTAL: 0 # Faster from-scratch builds.

jobs:
  build:
    runs-on: ubuntu-24.04
    permissions:
      # Permissions required of the Github token in order for
      # federated identity and authorization to work.
      contents: read
      id-token: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
          lfs: true

      # NOTE(review): presumably the build requires a running local database
      # (e.g. for compile-time query checks) -- confirm.
      - uses: supabase/setup-cli@v1
      - run: supabase start

      - name: Build `agent`
        run: cargo build --release -p agent

      # Stage the binary inside the directory passed as `source` below,
      # where the Dockerfile's `COPY agent ...` expects to find it.
      - run: mv target/release/agent crates/agent/

      - name: Authenticate with GCP Workload Identity Federation
        uses: google-github-actions/auth@v2
        with:
          # NOTE(review): this value reads as a redacted placeholder in this
          # view -- confirm the real service account address is present.
          service_account: [email protected]
          workload_identity_provider: projects/1084703453822/locations/global/workloadIdentityPools/github-actions/providers/github-actions-provider

      - name: Update Cloud Run service `agent-api`
        uses: google-github-actions/deploy-cloudrun@v2
        with:
          service: agent-api
          project_id: estuary-control
          region: us-central1
          source: crates/agent/
          timeout: 10m

          # DATABASE_CA is the path to which entrypoint.sh writes the
          # CONTROL_PLANE_DB_CA_CERT secret.
          # NOTE(review): the user@host portion of DATABASE_URL also reads as
          # a redacted placeholder in this view -- confirm.
          env_vars: |-
            BUILDS_ROOT=gs://estuary-control/builds/
            DATABASE_CA=/etc/db-ca.crt
            DATABASE_URL=postgresql://[email protected]:5432/postgres
            NO_COLOR=1
          secrets: |-
            PGPASSWORD=POSTGRES_PASSWORD:latest
            CONTROL_PLANE_DB_CA_CERT=CONTROL_PLANE_DB_CA_CERT:latest
          env_vars_update_strategy: overwrite
          secrets_update_strategy: overwrite
64 changes: 64 additions & 0 deletions .github/workflows/deploy-data-plane-controller.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Builds the `data-plane-controller` binary and deploys it as the Cloud Run
# job `data-plane-controller`, authenticating to GCP through workload
# identity federation.
name: Deploy data-plane-controller

on:
  workflow_dispatch: {}
  # TODO(johnny): Remove after merging.
  push:
    branches: [johnny/dpc-cd]

env:
  CARGO_INCREMENTAL: 0 # Faster from-scratch builds.

jobs:
  build:
    runs-on: ubuntu-24.04
    permissions:
      # Permissions required of the Github token in order for
      # federated identity and authorization to work.
      contents: read
      id-token: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true
          lfs: true

      # NOTE(review): presumably the build requires a running local database
      # (e.g. for compile-time query checks) -- confirm.
      - uses: supabase/setup-cli@v1
      - run: supabase start

      - name: Build `data-plane-controller`
        run: cargo build --release -p data-plane-controller

      # Stage the binary inside the directory passed as `source` below,
      # where the Dockerfile's `COPY data-plane-controller ...` expects it.
      - run: mv target/release/data-plane-controller crates/data-plane-controller/

      - name: Authenticate with GCP Workload Identity Federation
        uses: google-github-actions/auth@v2
        with:
          # NOTE(review): this value reads as a redacted placeholder in this
          # view -- confirm the real service account address is present.
          service_account: [email protected]
          workload_identity_provider: projects/1084703453822/locations/global/workloadIdentityPools/github-actions/providers/github-actions-provider

      - name: Update Cloud Run job `data-plane-controller`
        uses: google-github-actions/deploy-cloudrun@v2
        with:
          job: data-plane-controller
          project_id: estuary-control
          region: us-central1
          source: crates/data-plane-controller/
          timeout: 2h # Self-cancels after 1 hour, with 1 hour grace period.

          # DPC_DATABASE_CA is the path to which entrypoint.sh writes the
          # CONTROL_PLANE_DB_CA_CERT secret.
          # NOTE(review): the user@host portion of DPC_DATABASE_URL also reads
          # as a redacted placeholder in this view -- confirm.
          env_vars: |-
            DPC_DATABASE_CA=/etc/db-ca.crt
            DPC_DATABASE_URL=postgresql://[email protected]:5432/postgres
            NO_COLOR=1
          secrets: |-
            CONTROL_PLANE_DB_CA_CERT=CONTROL_PLANE_DB_CA_CERT:latest
            DPC_GITHUB_SSH_KEY=DPC_GITHUB_SSH_KEY:latest
            DPC_IAM_CREDENTIALS=DPC_IAM_CREDENTIALS:latest
            DPC_SERVICE_ACCOUNT=DPC_SERVICE_ACCOUNT:latest
            PGPASSWORD=POSTGRES_PASSWORD:latest
            VULTR_API_KEY=DPC_VULTR_API_KEY:latest
          env_vars_update_strategy: overwrite
          secrets_update_strategy: overwrite
31 changes: 31 additions & 0 deletions crates/agent/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Runtime image for the `agent` binary. The binary itself is built outside of
# Docker and must be present in the build context alongside entrypoint.sh.
FROM ubuntu:noble

# Install required packages.
# `apt-get` is used rather than `apt`, as it is the interface intended for scripts.
RUN apt-get update -y \
    && apt-get install --no-install-recommends -y \
        ca-certificates \
        s3cmd \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install the `sops` CLI.
# `--fail` makes the build stop on an HTTP error, rather than
# saving the server's error page as the `sops` executable.
RUN curl --fail -L -o /usr/local/bin/sops \
    https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64 \
    && chmod +x /usr/local/bin/sops

# Copy in our local assets.
COPY agent /usr/local/bin/
COPY entrypoint.sh /usr/local/bin/

ENV BIN_DIR=/usr/local/bin/
ENV RUST_LOG=info

# The entrypoint writes secrets to their expected file locations and then execs `agent`.
CMD ["/usr/local/bin/entrypoint.sh"]

# Example of running this container locally:
# docker run --rm --net=host -it \
# -e CONTROL_PLANE_DB_CA_CERT="$(</home/johnny/Downloads/prod-ca-2021.crt)" \
# -e DATABASE_URL="${DATABASE_URL}" \
# -e PGPASSWORD=${PGPASSWORD} \
# -e BUILDS_ROOT=gs://estuary-control/builds/ \
# foobar:latest
11 changes: 11 additions & 0 deletions crates/agent/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#
# Container entrypoint for `agent`: materializes secrets passed through the
# environment into the files the process expects, then replaces this shell
# with the `agent` process.

set -o errexit
set -o pipefail
set -o nounset

# Place secrets into expected file locations.
# `-p` keeps startup from aborting (under errexit) if the directory already exists.
mkdir -p /root/.aws
printf '%s\n' "${CONTROL_PLANE_DB_CA_CERT}" > /etc/db-ca.crt

# `exec` so that `agent` takes over this shell's PID and receives signals directly.
exec agent --allow-origin=https://dashboard.estuary.dev --allow-origin=http://localhost:3000
4 changes: 2 additions & 2 deletions crates/automations/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ models = { path = "../models", features = ["sqlx-support"] }

anyhow = { workspace = true }
futures = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
sqlx = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }

[dev-dependencies]
rand = { workspace = true }
tracing-subscriber = { workspace = true }
6 changes: 5 additions & 1 deletion crates/automations/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,12 @@ async fn ready_tasks_iter(
// If permits remain, there were not enough tasks to dequeue.
// Sleep for up-to `dequeue_interval`, cancelling early if a task completes.
if permits.num_permits() != 0 {
// Jitter dequeue by 10% in either direction, to ensure
// distribution of tasks and retries across executors.
let jitter = 0.9 + rand::random::<f64>() * 0.2; // [0.9, 1.1)

tokio::select! {
() = tokio::time::sleep(dequeue_interval) => (),
() = tokio::time::sleep(dequeue_interval.mul_f64(jitter)) => (),
_ = semaphore.clone().acquire_owned() => (), // Cancel sleep.
}
}
Expand Down
49 changes: 49 additions & 0 deletions crates/data-plane-controller/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Runtime image for the `data-plane-controller` binary. The binary itself is
# built outside of Docker and must be present in the build context alongside
# entrypoint.sh.
FROM ubuntu:noble

# Install required packages.
# `apt-get` is used rather than `apt`, as it is the interface intended for scripts.
RUN apt-get update -y \
    && apt-get install --no-install-recommends -y \
        ca-certificates \
        certbot \
        curl \
        git \
        openssh-client \
        python3-certbot-dns-google \
        python3-poetry \
        python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Install the `pulumi` CLI.
RUN curl -fsSL https://get.pulumi.com/ | bash -s
RUN ln -s /root/.pulumi/bin/pulumi /usr/local/bin/pulumi

# Install the `sops` CLI.
# `--fail` makes the build stop on an HTTP error, rather than
# saving the server's error page as the `sops` executable.
RUN curl --fail -L -o /usr/local/bin/sops \
    https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64 \
    && chmod +x /usr/local/bin/sops

# Copy in our local assets.
COPY data-plane-controller /usr/local/bin/
COPY entrypoint.sh /usr/local/bin/

# AWS profile to expect in ~/.aws/credentials
ENV AWS_PROFILE=data-plane-ops
# GCP Service Account JSON credentials path.
ENV GOOGLE_APPLICATION_CREDENTIALS=/etc/data_plane_controller.json
# Disable host-key checks when cloning our git repo.
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"

ENV RUST_LOG=info

# The entrypoint writes secrets to their expected file locations,
# loads the SSH key into an agent, and runs `data-plane-controller`.
CMD ["/usr/local/bin/entrypoint.sh"]

# Example of running this container locally:
# docker run --rm --net=host -it \
# -e CONTROL_PLANE_DB_CA_CERT="$(</home/johnny/Downloads/prod-ca-2021.crt)" \
# -e DPC_DATABASE_URL="${DATABASE_URL}" \
# -e DPC_GITHUB_SSH_KEY="$(</home/johnny/data_plane_controller.key)" \
# -e DPC_IAM_CREDENTIALS="$(</home/johnny/.aws/credentials)" \
# -e DPC_SERVICE_ACCOUNT="$(</etc/data_plane_controller.json)" \
# -e VULTR_API_KEY="${VULTR_API_KEY}" \
# -e PGPASSWORD=${PGPASSWORD} \
# foobar:latest
40 changes: 40 additions & 0 deletions crates/data-plane-controller/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Container entrypoint for `data-plane-controller`: materializes secrets passed
# through the environment into files, loads the GitHub SSH key into an
# ssh-agent, runs the controller, asks it to stop after one hour, and exits
# with the controller's own exit status.

set -o errexit
set -o pipefail
set -o nounset

# Place secrets into expected file locations.
# The SSH key in particular requires a trailing newline.
# `-p` keeps startup from aborting (under errexit) if the directory already exists.
mkdir -p /root/.aws
printf '%s\n' "${CONTROL_PLANE_DB_CA_CERT}" > /etc/db-ca.crt
printf '%s\n' "${DPC_GITHUB_SSH_KEY}" > /root/ssh_key
printf '%s\n' "${DPC_IAM_CREDENTIALS}" > /root/.aws/credentials
# Quoted so that a path containing whitespace is not an "ambiguous redirect".
printf '%s\n' "${DPC_SERVICE_ACCOUNT}" > "${GOOGLE_APPLICATION_CREDENTIALS}"

# Start background ssh-agent, evaluate output to set variables, and add SSH key.
chmod 0400 /root/ssh_key
eval "$(ssh-agent -s)"
ssh-add /root/ssh_key

# Log out the IP from which we're running.
echo "Current egress IP:"
curl -s -S http://icanhazip.com

# Start data-plane-controller in the background
data-plane-controller &
DPC_PID=$!

# Start a background timer to send SIGINT after one hour.
# The kill is best-effort: the controller may already have exited by then.
(
  sleep 3600
  kill -INT "${DPC_PID}" 2>/dev/null || true
) &

# Wait for data-plane-controller to exit and surface its status.
# errexit is disabled so that a non-zero status is captured and reported
# below, rather than terminating this script at `wait`.
set +o errexit
wait "${DPC_PID}"
DPC_STATUS=${?}

echo "data-plane-controller exited with status ${DPC_STATUS}"
exit "${DPC_STATUS}"
7 changes: 3 additions & 4 deletions crates/data-plane-controller/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,18 @@ pub struct Args {
env = "DPC_DATABASE_URL",
default_value = "postgres://postgres:[email protected]:5432/postgres"
)]
#[serde(skip_serializing)]
database_url: url::Url,
/// Path to CA certificate of the database.
#[clap(long = "database-ca", env = "DPC_DATABASE_CA")]
database_ca: Option<String>,
/// Number of tasks which may be polled concurrently.
#[clap(long = "concurrency", env = "DPC_CONCURRENCY", default_value = "2")]
#[clap(long = "concurrency", env = "DPC_CONCURRENCY", default_value = "1")]
concurrency: u32,
/// Interval between polls for dequeue-able tasks when otherwise idle.
#[clap(
long = "dequeue-interval",
env = "DPC_DEQUEUE_INTERVAL",
default_value = "5s"
default_value = "10s"
)]
#[serde(with = "humantime_serde")]
#[arg(value_parser = humantime::parse_duration)]
Expand Down Expand Up @@ -96,7 +95,7 @@ pub async fn run(args: Args) -> anyhow::Result<()> {
}

let pg_pool = sqlx::postgres::PgPoolOptions::new()
.acquire_timeout(std::time::Duration::from_secs(5))
.acquire_timeout(std::time::Duration::from_secs(30))
.connect_with(pg_options)
.await
.context("connecting to database")?;
Expand Down
4 changes: 2 additions & 2 deletions crates/data-plane-controller/src/stack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pub struct DataPlane {
pub control_plane_api: url::Url,
pub data_buckets: Vec<url::Url>,
pub gcp_project: String,
pub ssh_subnets: Vec<ipnetwork::Ipv6Network>,
pub ssh_subnets: Vec<ipnetwork::IpNetwork>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub private_links: Vec<AWSPrivateLink>,
pub deployments: Vec<Deployment>,
Expand Down Expand Up @@ -86,7 +86,7 @@ pub struct AnsibleRole {

#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct AnsibleHost {
pub ansible_host: std::net::Ipv6Addr,
pub ansible_host: std::net::IpAddr,
pub ansible_user: String,
pub host_fqdn: String,
pub local_cert_pem: String,
Expand Down

0 comments on commit dfc5b27

Please sign in to comment.