diff --git a/.github/workflows/deploy-agent-api.yaml b/.github/workflows/deploy-agent-api.yaml new file mode 100644 index 0000000000..714aa6ef8e --- /dev/null +++ b/.github/workflows/deploy-agent-api.yaml @@ -0,0 +1,63 @@ +name: Deploy agent-api + +on: + workflow_dispatch: {} + push: + branches: [johnny/dpc-cd] # NOTE(review): personal branch trigger, presumably temporary; confirm before merging. + +env: + CARGO_INCREMENTAL: 0 # Faster from-scratch builds. + +jobs: + build: + runs-on: ubuntu-24.04 + permissions: + # Permissions required of the GitHub token in order for + # federated identity and authorization to work. + contents: read + id-token: write + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + lfs: true + + - uses: supabase/setup-cli@v1 + - run: supabase start + + - name: Build `agent` + run: cargo build --release -p agent + + - run: mv target/release/agent crates/agent/ # Stage the binary inside the directory deployed as `source` below. + + - name: Authenticate with GCP Workload Identity Federation + uses: google-github-actions/auth@v2 + with: + service_account: cd-github-actions@estuary-control.iam.gserviceaccount.com + workload_identity_provider: projects/1084703453822/locations/global/workloadIdentityPools/github-actions/providers/github-actions-provider + + - name: Update Cloud Run service `agent-api` + uses: google-github-actions/deploy-cloudrun@v2 + with: + service: agent-api + project_id: estuary-control + region: us-central1 + source: crates/agent/ + timeout: 10m + + # Temporary during testing. 
+ no_traffic: true + + env_vars: |- + BUILDS_ROOT=gs://estuary-control/builds/ + DATABASE_CA=/etc/db-ca.crt + DATABASE_URL=postgresql://postgres@db.eyrcnmuzzyriypdajwdk.supabase.co:5432/postgres + NO_COLOR=1 + + secrets: |- + PGPASSWORD=POSTGRES_PASSWORD:latest + CONTROL_PLANE_DB_CA_CERT=CONTROL_PLANE_DB_CA_CERT:latest + + # env_vars_update_strategy: overwrite + # secrets_update_strategy: overwrite diff --git a/.github/workflows/deploy-data-plane-controller.yaml b/.github/workflows/deploy-data-plane-controller.yaml new file mode 100644 index 0000000000..a587de168a --- /dev/null +++ b/.github/workflows/deploy-data-plane-controller.yaml @@ -0,0 +1,63 @@ +name: Deploy data-plane-controller + +on: + workflow_dispatch: {} + push: + branches: [johnny/dpc-cd] # NOTE(review): personal branch trigger, presumably temporary; confirm before merging. + +env: + CARGO_INCREMENTAL: 0 # Faster from-scratch builds. + +jobs: + build: + runs-on: ubuntu-24.04 + permissions: + # Permissions required of the GitHub token in order for + # federated identity and authorization to work. + contents: read + id-token: write + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + lfs: true + + - uses: supabase/setup-cli@v1 + - run: supabase start + + - name: Build `data-plane-controller` + run: cargo build --release -p data-plane-controller + + - run: mv target/release/data-plane-controller crates/data-plane-controller/ # Stage the binary inside the directory deployed as `source` below. + + - name: Authenticate with GCP Workload Identity Federation + uses: google-github-actions/auth@v2 + with: + service_account: cd-github-actions@estuary-control.iam.gserviceaccount.com + workload_identity_provider: projects/1084703453822/locations/global/workloadIdentityPools/github-actions/providers/github-actions-provider + + - name: Update Cloud Run job `data-plane-controller` + uses: google-github-actions/deploy-cloudrun@v2 + with: + job: data-plane-controller + project_id: estuary-control + region: us-central1 + source: crates/data-plane-controller/ + timeout: 2h # Self-cancels after 1 hour, with 1 hour grace period. 
+ + env_vars: |- + DPC_DATABASE_CA=/etc/db-ca.crt + DPC_DATABASE_URL=postgresql://postgres@db.eyrcnmuzzyriypdajwdk.supabase.co:5432/postgres + NO_COLOR=1 + + secrets: |- + CONTROL_PLANE_DB_CA_CERT=CONTROL_PLANE_DB_CA_CERT:latest + DPC_GITHUB_SSH_KEY=DPC_GITHUB_SSH_KEY:latest + DPC_IAM_CREDENTIALS=DPC_IAM_CREDENTIALS:latest + DPC_SERVICE_ACCOUNT=DPC_SERVICE_ACCOUNT:latest + PGPASSWORD=POSTGRES_PASSWORD:latest + VULTR_API_KEY=DPC_VULTR_API_KEY:latest + + env_vars_update_strategy: overwrite + secrets_update_strategy: overwrite diff --git a/crates/agent/Dockerfile b/crates/agent/Dockerfile new file mode 100644 index 0000000000..d645361fd2 --- /dev/null +++ b/crates/agent/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:noble + +# Install required packages. +RUN apt update -y \ + && apt install --no-install-recommends -y \ + ca-certificates \ + s3cmd \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install the `sops` CLI (`-f` makes an HTTP error fail the build instead of saving the error page). +RUN curl -fL -o /usr/local/bin/sops \ + https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64 \ + && chmod +x /usr/local/bin/sops + +# Copy in our local assets. 
+COPY agent /usr/local/bin/ +COPY entrypoint.sh /usr/local/bin/ + +ENV BIN_DIR=/usr/local/bin/ +ENV RUST_LOG=info + +CMD ["/usr/local/bin/entrypoint.sh"] + +# Example of running this container locally: +# docker run --rm --net=host -it \ +# -e CONTROL_PLANE_DB_CA_CERT="$( /etc/db-ca.crt + +exec agent --allow-origin=https://dashboard.estuary.dev --allow-origin=http://localhost:3000 \ No newline at end of file diff --git a/crates/automations/Cargo.toml b/crates/automations/Cargo.toml index 70a7f166ad..8b27b714ff 100644 --- a/crates/automations/Cargo.toml +++ b/crates/automations/Cargo.toml @@ -14,12 +14,12 @@ models = { path = "../models", features = ["sqlx-support"] } anyhow = { workspace = true } futures = { workspace = true } +rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlx = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } -tracing-subscriber = { workspace = true } [dev-dependencies] -rand = { workspace = true } +tracing-subscriber = { workspace = true } \ No newline at end of file diff --git a/crates/automations/src/server.rs b/crates/automations/src/server.rs index 92f0055156..05b520bfda 100644 --- a/crates/automations/src/server.rs +++ b/crates/automations/src/server.rs @@ -227,8 +227,12 @@ async fn ready_tasks_iter( // If permits remain, there were not enough tasks to dequeue. // Sleep for up-to `dequeue_interval`, cancelling early if a task completes. if permits.num_permits() != 0 { + // Jitter dequeue by 10% in either direction, to ensure + // distribution of tasks and retries across executors. + let jitter = 0.9 + rand::random::<f64>() * 0.2; // [0.9, 1.1) + tokio::select! { - () = tokio::time::sleep(dequeue_interval) => (), + () = tokio::time::sleep(dequeue_interval.mul_f64(jitter)) => (), _ = semaphore.clone().acquire_owned() => (), // Cancel sleep. 
} } diff --git a/crates/data-plane-controller/Dockerfile b/crates/data-plane-controller/Dockerfile new file mode 100644 index 0000000000..baa2cf8c34 --- /dev/null +++ b/crates/data-plane-controller/Dockerfile @@ -0,0 +1,49 @@ +FROM ubuntu:noble + +# Install required packages. +RUN apt update -y \ + && apt install --no-install-recommends -y \ + ca-certificates \ + certbot \ + curl \ + git \ + openssh-client \ + python3-certbot-dns-google \ + python3-poetry \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* + +# Install the `pulumi` CLI. +RUN curl -fsSL https://get.pulumi.com/ | bash -s +RUN ln -s /root/.pulumi/bin/pulumi /usr/local/bin/pulumi + +# Install the `sops` CLI (`-f` makes an HTTP error fail the build instead of saving the error page). +RUN curl -fL -o /usr/local/bin/sops \ + https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64 \ + && chmod +x /usr/local/bin/sops + +# Copy in our local assets. +COPY data-plane-controller /usr/local/bin/ +COPY entrypoint.sh /usr/local/bin/ + +# AWS profile to expect in ~/.aws/credentials +ENV AWS_PROFILE=data-plane-ops +# GCP Service Account JSON credentials path. +ENV GOOGLE_APPLICATION_CREDENTIALS=/etc/data_plane_controller.json +# Disable host-key checks when cloning our git repo. +ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no" + +ENV RUST_LOG=info + +CMD ["/usr/local/bin/entrypoint.sh"] + +# Example of running this container locally: +# docker run --rm --net=host -it \ +# -e CONTROL_PLANE_DB_CA_CERT="$( /etc/db-ca.crt +printf '%s\n' "${DPC_GITHUB_SSH_KEY}" > /root/ssh_key +printf '%s\n' "${DPC_IAM_CREDENTIALS}" > /root/.aws/credentials +printf '%s\n' "${DPC_SERVICE_ACCOUNT}" > ${GOOGLE_APPLICATION_CREDENTIALS} + +# Start background ssh-agent, evaluate output to set variables, and add SSH key. +chmod 0400 /root/ssh_key +eval "$(ssh-agent -s)" +ssh-add /root/ssh_key + +# Log out the IP from which we're running. +echo "Current egress IP:" +curl -s -S http://icanhazip.com + +# Start data-plane-controller in the background +data-plane-controller & +DPC_PID=$! 
+ +# Start a background timer to send SIGINT after one hour. +( + sleep 3600 + kill -INT ${DPC_PID} 2>/dev/null || true +) & + +# Wait for data-plane-controller to exit and surface its status. +set +o errexit +wait ${DPC_PID} +DPC_STATUS=${?} + +echo "data-plane-controller exited with status ${DPC_STATUS}" +exit ${DPC_STATUS} \ No newline at end of file diff --git a/crates/data-plane-controller/src/lib.rs b/crates/data-plane-controller/src/lib.rs index 1ee4ee4354..f4f628113f 100644 --- a/crates/data-plane-controller/src/lib.rs +++ b/crates/data-plane-controller/src/lib.rs @@ -17,19 +17,18 @@ pub struct Args { env = "DPC_DATABASE_URL", default_value = "postgres://postgres:postgres@127.0.0.1:5432/postgres" )] - #[serde(skip_serializing)] database_url: url::Url, /// Path to CA certificate of the database. #[clap(long = "database-ca", env = "DPC_DATABASE_CA")] database_ca: Option, /// Number of tasks which may be polled concurrently. - #[clap(long = "concurrency", env = "DPC_CONCURRENCY", default_value = "2")] + #[clap(long = "concurrency", env = "DPC_CONCURRENCY", default_value = "1")] concurrency: u32, /// Interval between polls for dequeue-able tasks when otherwise idle. 
#[clap( long = "dequeue-interval", env = "DPC_DEQUEUE_INTERVAL", - default_value = "5s" + default_value = "10s" )] #[serde(with = "humantime_serde")] #[arg(value_parser = humantime::parse_duration)] @@ -96,7 +95,7 @@ pub async fn run(args: Args) -> anyhow::Result<()> { } let pg_pool = sqlx::postgres::PgPoolOptions::new() - .acquire_timeout(std::time::Duration::from_secs(5)) + .acquire_timeout(std::time::Duration::from_secs(30)) .connect_with(pg_options) .await .context("connecting to database")?; diff --git a/crates/data-plane-controller/src/stack.rs b/crates/data-plane-controller/src/stack.rs index 0ec32aa92a..0f5276d32f 100644 --- a/crates/data-plane-controller/src/stack.rs +++ b/crates/data-plane-controller/src/stack.rs @@ -29,7 +29,7 @@ pub struct DataPlane { pub control_plane_api: url::Url, pub data_buckets: Vec, pub gcp_project: String, - pub ssh_subnets: Vec, + pub ssh_subnets: Vec, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub private_links: Vec, pub deployments: Vec, @@ -86,7 +86,7 @@ pub struct AnsibleRole { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct AnsibleHost { - pub ansible_host: std::net::Ipv6Addr, + pub ansible_host: std::net::IpAddr, pub ansible_user: String, pub host_fqdn: String, pub local_cert_pem: String,