From 8adf65d71b5a42c4685e33db9198bb78ffe016dd Mon Sep 17 00:00:00 2001 From: Tucker Beck Date: Fri, 5 Apr 2024 13:18:03 -0700 Subject: [PATCH] Revised setup for jobbergate-agent in jobbergate-composed This change moves jobbergate-agent into a stand-alone slurmd node instead of embedding it in the slurmctld node. This will more closely mimic what we do in production where agents are installed on login nodes. --- jobbergate-agent/Dockerfile.dev | 20 ----------- jobbergate-composed/Dockerfile-slurm | 21 +++++++++++- jobbergate-composed/docker-compose.yml | 38 ++++++++++++++++++--- jobbergate-composed/etc/slurm-entrypoint.sh | 20 +++++++++++ jobbergate-composed/etc/slurm.conf | 2 ++ 5 files changed, 76 insertions(+), 25 deletions(-) delete mode 100644 jobbergate-agent/Dockerfile.dev diff --git a/jobbergate-agent/Dockerfile.dev b/jobbergate-agent/Dockerfile.dev deleted file mode 100644 index 94901038a..000000000 --- a/jobbergate-agent/Dockerfile.dev +++ /dev/null @@ -1,20 +0,0 @@ -FROM slurm-docker-cluster - -WORKDIR /app - -RUN apt update && apt install -y curl libpq-dev gcc python3-dev python3-pip && \ - ln -s /usr/bin/python3 /usr/bin/python - -RUN curl -sSL https://install.python-poetry.org | \ - POETRY_HOME=/opt/poetry POETRY_VERSION=1.5.1 python && \ - ln -s /opt/poetry/bin/poetry /usr/local/bin/poetry && \ - poetry config virtualenvs.create false - -COPY ./pyproject.toml ./poetry.lock* ./README* ./LICENSE* /app/ -COPY ./etc/entrypoint.sh /app/entrypoint.sh -WORKDIR /app - -VOLUME /app/jobbergate_agent -VOLUME /jobbergate-core - -ENTRYPOINT /app/entrypoint.sh diff --git a/jobbergate-composed/Dockerfile-slurm b/jobbergate-composed/Dockerfile-slurm index d4c6e663a..12eb58d4e 100644 --- a/jobbergate-composed/Dockerfile-slurm +++ b/jobbergate-composed/Dockerfile-slurm @@ -1,4 +1,4 @@ -FROM ubuntu:jammy-20211122 +FROM ubuntu:jammy-20211122 as slurm-base # Install GOSU @@ -55,3 +55,22 @@ RUN useradd -ms /bin/bash local-user COPY etc/slurm-entrypoint.sh /usr/local/bin/slurm-entrypoint.sh ENTRYPOINT ["/usr/local/bin/slurm-entrypoint.sh"] + + + +FROM slurm-base as jobbergate-agent + +RUN apt update && apt install -y curl libpq-dev gcc python3-dev python3-pip && \ + ln -s /usr/bin/python3 /usr/bin/python + +RUN curl -sSL https://install.python-poetry.org | \ + POETRY_HOME=/opt/poetry POETRY_VERSION=1.5.1 python && \ + ln -s /opt/poetry/bin/poetry /usr/local/bin/poetry && \ + poetry config virtualenvs.create false + +WORKDIR /app + +# VOLUME /app/jobbergate_agent +# VOLUME /jobbergate-core + +# ENTRYPOINT /app/entrypoint.sh diff --git a/jobbergate-composed/docker-compose.yml b/jobbergate-composed/docker-compose.yml index bf8f5a651..aed14667a 100644 --- a/jobbergate-composed/docker-compose.yml +++ b/jobbergate-composed/docker-compose.yml @@ -225,15 +225,38 @@ services: slurmctld: build: - context: ../jobbergate-agent/ - dockerfile: Dockerfile.dev + context: . + dockerfile: Dockerfile-slurm + target: slurm-base args: - JWT_SECRET=${JWT_SECRET:-supersecret} + image: slurm-docker-cluster networks: - jobbergate-net command: ["slurmctld"] container_name: slurmctld hostname: slurmctld + volumes: + - etc_munge:/etc/munge + - var_log_slurm:/var/log/slurm + - ./slurm-fake-nfs:/nfs + - ./slurm-work-dir:/slurm-work-dir + expose: + - "6817" + + jobbergate-agent: + privileged: true + build: + context: . + dockerfile: Dockerfile-slurm + target: jobbergate-agent + args: + - JWT_SECRET=${JWT_SECRET:-supersecret} + networks: + - jobbergate-net + command: ["jobbergate-agent"] + container_name: jobbergate-agent + hostname: jobbergate-agent environment: - JOBBERGATE_AGENT_X_SLURM_USER_NAME=local-user - JOBBERGATE_AGENT_DEFAULT_SLURM_WORK_DIR=/slurm-work-dir @@ -252,20 +275,25 @@ services: - ./slurm-fake-nfs:/nfs - ./slurm-work-dir:/slurm-work-dir - ../jobbergate-agent/jobbergate_agent/:/app/jobbergate_agent + - ../jobbergate-agent/pyproject.toml:/app/pyproject.toml + - ../jobbergate-agent/poetry.lock:/app/poetry.lock + - ../jobbergate-agent/README.md:/app/README.md + - ../jobbergate-agent/LICENSE:/app/LICENSE - ../jobbergate-core/:/jobbergate-core - jobbergate-agent-cache:/cache/ expose: - - "6817" + - "6818" depends_on: jobbergate-api: condition: service_healthy - slurmdbd: + slurmctld: condition: service_started slurmdbd: build: context: . dockerfile: Dockerfile-slurm + target: slurm-base image: slurm-docker-cluster networks: - jobbergate-net @@ -287,6 +315,7 @@ services: build: context: . dockerfile: Dockerfile-slurm + target: slurm-base image: slurm-docker-cluster networks: - jobbergate-net @@ -308,6 +337,7 @@ services: build: context: . dockerfile: Dockerfile-slurm + target: slurm-base image: slurm-docker-cluster networks: - jobbergate-net diff --git a/jobbergate-composed/etc/slurm-entrypoint.sh b/jobbergate-composed/etc/slurm-entrypoint.sh index 3e6b3667a..e5f8a127a 100755 --- a/jobbergate-composed/etc/slurm-entrypoint.sh +++ b/jobbergate-composed/etc/slurm-entrypoint.sh @@ -51,4 +51,24 @@ then exec /usr/sbin/slurmd -Dvvv fi +if [ "$1" = "jobbergate-agent" ] +then + echo "---> Waiting for slurmctld to become active before starting jobbergate-agent..." + + until 2>/dev/null >/dev/tcp/slurmctld/6817 + do + echo "-- slurmctld is not available. Sleeping ..." + sleep 2 + done + echo "-- slurmctld is now active ..." + + echo "---> Starting the Slurm Node Daemon (slurmd) ..." + exec /usr/sbin/slurmd -Dvvv & + + echo "---> Starting Jobbergate-agent ..." + cd /app + poetry install + poetry run jg-run +fi + exec "$@" diff --git a/jobbergate-composed/etc/slurm.conf b/jobbergate-composed/etc/slurm.conf index 6aeda03e2..8b4ab74dd 100644 --- a/jobbergate-composed/etc/slurm.conf +++ b/jobbergate-composed/etc/slurm.conf @@ -86,6 +86,8 @@ AccountingStorageHost=slurmdbd # # COMPUTE NODES NodeName=c[1-2] RealMemory=1000 State=UNKNOWN CPUs=4 +NodeName=jobbergate-agent RealMemory=1000 State=UNKNOWN CPUs=4 # # PARTITIONS PartitionName=compute Default=yes Nodes=c[1-2] Priority=50 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP +PartitionName=login Default=no Nodes=jobbergate-agent Priority=50 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=INACTIVE