Skip to content

Commit

Permalink
Merge pull request #54 from nebius/slurm_packages_only_workflow
Browse files Browse the repository at this point in the history
Build only slurm packages separate from pyxis and nccl-tests
  • Loading branch information
asteny authored Feb 19, 2025
2 parents d1af626 + e72b76c commit 8e27edb
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 17 deletions.
92 changes: 92 additions & 0 deletions .github/slurm-packages/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Base image; declared before FROM so it can be overridden with --build-arg.
ARG BASE_IMAGE=ubuntu:22.04

FROM ${BASE_IMAGE}

# Slurm release whose source tarball is downloaded and packaged below.
ARG SLURM_VERSION=24.05.5
# OpenMPI package version and revision suffix; the package is pinned as
# openmpi=<OPENMPI_VERSION>-<OPENMPI_SUBVERSION> when it is installed.
ARG OPENMPI_VERSION=4.1.7a1
ARG OPENMPI_SUBVERSION=1.2310055
# MLNX_OFED release; selects the Mellanox apt repository added below.
ARG OFED_VERSION=23.10-2.1.3.1

# Declared as ARG (not ENV) so it applies to build steps only and is not
# persisted into the image's runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# Install the toolchain, Debian packaging tools and the libraries needed to
# build Slurm.
# apt-get is used instead of apt: apt is the interactive front-end and warns
# that its CLI is not stable for use in scripts.
# update + install stay in one layer so the package index is never stale.
# NOTE(review): --no-install-recommends is deliberately not added here; confirm
# that nothing the later steps rely on (e.g. CA certificates for the HTTPS
# downloads) arrives only as a recommended package before adding it.
RUN apt-get update && \
    apt-get -y install \
        autoconf \
        build-essential \
        curl \
        debhelper \
        devscripts \
        equivs \
        fakeroot \
        flex \
        git \
        jq \
        libevent-dev \
        libhwloc-dev \
        libjansson-dev \
        libjson-c-dev \
        libjwt-dev \
        libjwt0 \
        liblz4-dev \
        libmunge-dev \
        libpam0g-dev \
        libpmix-dev \
        libpmix2 \
        libssl-dev \
        libtool \
        munge \
        pkg-config \
        squashfs-tools \
        wget \
        zlib1g \
        zlib1g-dev \
        zstd

# Fetch the Slurm source tarball for SLURM_VERSION, unpack it under /usr/src
# and delete the archive in the same layer so it does not stay in the image.
RUN cd /usr/src \
    && tarball="slurm-${SLURM_VERSION}.tar.bz2" \
    && wget "https://download.schedmd.com/slurm/${tarball}" \
    && tar -xvf "${tarball}" \
    && rm -rf "${tarball}"

# Install OpenMPI from the Mellanox OFED apt repository.
#  - The repository list is selected by OFED_VERSION and the distro id/version
#    read from /etc/os-release (e.g. ubuntu22.04).
#  - The signing key is downloaded to a file first rather than piped into
#    apt-key: the default shell has no pipefail, so a failed download in a
#    pipe would be masked by apt-key's exit status.
#  - apt-get install needs -y; without it the non-interactive build aborts at
#    the confirmation prompt whenever additional packages must be installed.
RUN cd /etc/apt/sources.list.d && \
    wget "https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_VERSION}/$(. /etc/os-release; echo "$ID$VERSION_ID")/mellanox_mlnx_ofed.list" && \
    wget -qO /tmp/RPM-GPG-KEY-Mellanox https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox && \
    apt-key add /tmp/RPM-GPG-KEY-Mellanox && \
    rm -f /tmp/RPM-GPG-KEY-Mellanox && \
    apt-get update && \
    apt-get install -y openmpi=${OPENMPI_VERSION}-${OPENMPI_SUBVERSION}

# Library search path (system, NVIDIA, CUDA and OpenMPI directories) and PATH
# extension for the OpenMPI tree under /usr/mpi/gcc/openmpi-<OPENMPI_VERSION>.
ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib \
    PATH=${PATH}:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin

# Build the Slurm .deb packages from the unpacked source tree:
#  1. rewrite the bare --with-pmix flag in debian/rules to carry an explicit
#     PMIx path (/usr/lib/x86_64-linux-gnu/pmix2);
#  2. install the build dependencies declared in debian/control;
#  3. build unsigned binary packages (-b -uc -us).
RUN cd "/usr/src/slurm-${SLURM_VERSION}" \
    && sed -i 's/--with-pmix\b/--with-pmix=\/usr\/lib\/x86_64-linux-gnu\/pmix2/' debian/rules \
    && mk-build-deps -i debian/control -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" \
    && debuild -b -uc -us

################################################################
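# RESULT (example listing, apparently from a 24.05.02 build; the version in the file names follows the Slurm release being built)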
################################################################
# /usr/src/slurm-smd-client_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-dev_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-doc_24.05.02-1_all.deb
# /usr/src/slurm-smd-libnss-slurm_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-libpam-slurm-adopt_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-libpmi0_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-libpmi2-0_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-libslurm-perl_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-openlava_24.05.02-1_all.deb
# /usr/src/slurm-smd-sackd_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-slurmctld_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-slurmd_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-slurmdbd_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-slurmrestd_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-sview_24.05.02-1_amd64.deb
# /usr/src/slurm-smd-torque_24.05.02-1_all.deb
# /usr/src/slurm-smd_24.05.02-1_amd64.deb
################################################################

# Gather every built package into /usr/src/debs so the packages can be copied
# out of the image from a single directory.
RUN mkdir /usr/src/debs \
    && mv /usr/src/*.deb /usr/src/debs/
29 changes: 12 additions & 17 deletions .github/workflows/slurm_packages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,12 @@ jobs:
slurm:
- version: 24.05.5
image:
- context: .
- file: slurm-packages/Dockerfile
push: false
platforms: linux/amd64
load: true
ubuntu_version: jammy
cuda_version: 12.4.1
build_args:
BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
BASE_IMAGE=ubuntu:22.04

steps:
- name: Harden Runner
Expand All @@ -42,9 +40,9 @@ jobs:
- name: Build docker images
uses: docker/build-push-action@ca877d9245402d1537745e0e356eab47c3520991 # v6.13.0
with:
context: ${{ matrix.image.context }}
file: ${{ matrix.image.file }}
push: ${{ matrix.image.push }}
tags: slurm_builder:${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }}
tags: slurm_packages:${{ matrix.slurm.version }}
platforms: ${{ matrix.image.platforms }}
load: ${{ matrix.image.load }}
build-args: |
Expand All @@ -53,31 +51,28 @@ jobs:
cache-from: type=local,src=.buildx-cache
cache-to: type=local,dest=.buildx-cache,mode=max

- name: Create slurm_build_output directory
run: mkdir -p slurm_build_output
- name: Create slurm_packages_output directory
run: mkdir -p slurm_packages_output

- name: Run Docker container and copy files
run: |
container_id=$(docker create slurm_builder:${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }})
container_id=$(docker create slurm_packages:${{ matrix.slurm.version }})
docker start $container_id
docker cp $container_id:/usr/src/debs/ ./slurm_build_output/
docker cp $container_id:/usr/src/nccl-tests/build/nccl-tests-perf.tar.gz ./slurm_build_output/
docker cp $container_id:/usr/src/debs/ ./slurm_packages_output/
docker rm $container_id
- name: Create GitHub Release and Upload DEB packages
uses: softprops/action-gh-release@01570a1f39cb168c169c802c3bceb9e93fb10974 # v2.1.0
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }}
name: Cuda ${{ matrix.image.cuda_version }}, Ubuntu ${{ matrix.image.ubuntu_version }}, Slurm ${{ matrix.slurm.version }}
body: "Release based on Ubuntu ${{ matrix.image.ubuntu_version }} for Slurm ${{ matrix.slurm.version }} with cuda ${{ matrix.image.cuda_version }}, nccl-tests and pyxis"
tag_name: slurm-packages-${{ matrix.slurm.version }}
name: Slurm packages ${{ matrix.slurm.version }}
draft: false
prerelease: false
files: |
slurm_build_output/debs/*.deb
slurm_build_output/nccl-tests-perf.tar.gz
slurm_packages_output/debs/*.deb
- name: Cleanup
run: rm -rf slurm_build_output/*.deb
run: rm -rf slurm_packages_output/*.deb
83 changes: 83 additions & 0 deletions .github/workflows/slurm_packages_and_pyxis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Manually triggered workflow: builds the slurm_builder image, copies the
# Slurm .deb packages and the nccl-tests tarball out of it, and publishes
# them as a GitHub release.
name: Build slurm packages

on:
  workflow_dispatch:

# Read-only by default; the build job widens this to what it needs.
permissions:
  contents: read

jobs:
  build:
    permissions:
      contents: write  # for softprops/action-gh-release to create GitHub release
    runs-on: self-hosted

    strategy:
      fail-fast: false
      matrix:
        slurm:
          - version: 24.05.5
        image:
          - context: .
            push: false
            platforms: linux/amd64
            load: true
            ubuntu_version: jammy
            cuda_version: 12.4.1
            build_args:
              BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
        with:
          egress-policy: audit

      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca # v3.9.0

      - name: Build docker images
        uses: docker/build-push-action@ca877d9245402d1537745e0e356eab47c3520991 # v6.13.0
        with:
          context: ${{ matrix.image.context }}
          push: ${{ matrix.image.push }}
          tags: slurm_builder:${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }}
          platforms: ${{ matrix.image.platforms }}
          # load the image into the local Docker daemon so the next steps can
          # create a container from it
          load: ${{ matrix.image.load }}
          build-args: |
            ${{ matrix.image.build_args }}
            SLURM_VERSION=${{ matrix.slurm.version }}
          cache-from: type=local,src=.buildx-cache
          cache-to: type=local,dest=.buildx-cache,mode=max

      - name: Create slurm_build_output directory
        run: mkdir -p slurm_build_output

      - name: Run Docker container and copy files
        run: |
          container_id=$(docker create slurm_builder:${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }})
          docker start $container_id
          docker cp $container_id:/usr/src/debs/ ./slurm_build_output/
          docker cp $container_id:/usr/src/nccl-tests/build/nccl-tests-perf.tar.gz ./slurm_build_output/
          docker rm $container_id

      - name: Create GitHub Release and Upload DEB packages
        uses: softprops/action-gh-release@01570a1f39cb168c169c802c3bceb9e93fb10974 # v2.1.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }}
          name: Cuda ${{ matrix.image.cuda_version }}, Ubuntu ${{ matrix.image.ubuntu_version }}, Slurm ${{ matrix.slurm.version }}
          body: "Release based on Ubuntu ${{ matrix.image.ubuntu_version }} for Slurm ${{ matrix.slurm.version }} with cuda ${{ matrix.image.cuda_version }}, nccl-tests and pyxis"
          draft: false
          prerelease: false
          files: |
            slurm_build_output/debs/*.deb
            slurm_build_output/nccl-tests-perf.tar.gz

      # The packages are copied to slurm_build_output/debs/ and the tarball to
      # slurm_build_output/, so a slurm_build_output/*.deb glob matches nothing.
      # Remove the whole directory so no artefacts linger on the self-hosted
      # runner between runs.
      - name: Cleanup
        run: rm -rf slurm_build_output

0 comments on commit 8e27edb

Please sign in to comment.