From e2890c0be5683d534011e596cd850e2839129bec Mon Sep 17 00:00:00 2001
From: Pavel Sofronii
Date: Wed, 19 Feb 2025 14:39:39 +0100
Subject: [PATCH 1/2] Workflow for building pyxis only

---
Review notes (everything between `---` and the first `diff --git` is
ignored by `git am`):

* The workflow is renamed to pyxis.yml and builds only the pyxis .deb from
  pyxis/Dockerfile. The root Dockerfile (Slurm, OpenMPI, nccl-tests) is
  deleted; slurm-smd and slurm-smd-dev are now downloaded from the
  nebius/slurm-deb-packages release matching SLURM_VERSION.
* Fixed the matrix key `vesion` -> `version`. The PYXIS_VERSION build-arg
  and the release tag/name already read `matrix.pyxis.version`, which did
  not exist and expanded to an empty string (`PYXIS_VERSION=`, tag `pyxis-`).
* Restored the `create` subcommand in `container_id=$(docker create ...)`;
  without it the command fails and container_id is empty.
* Cleanup removes the pyxis_build_output directory. `docker cp` puts the
  packages in pyxis_build_output/debs/, so `pyxis_build_output/*.deb`
  matched nothing.
* The `index` hashes for pyxis.yml predate these fixes and are stale.

 ...slurm_packages_and_pyxis.yml => pyxis.yml} |  37 ++---
 Dockerfile                                    | 148 ------------------
 pyxis/Dockerfile                              |  72 +++++++++
 3 files changed, 91 insertions(+), 166 deletions(-)
 rename .github/workflows/{slurm_packages_and_pyxis.yml => pyxis.yml} (57%)
 delete mode 100644 Dockerfile
 create mode 100644 pyxis/Dockerfile

diff --git a/.github/workflows/slurm_packages_and_pyxis.yml b/.github/workflows/pyxis.yml
similarity index 57%
rename from .github/workflows/slurm_packages_and_pyxis.yml
rename to .github/workflows/pyxis.yml
index 9d93ff8..6b40986 100644
--- a/.github/workflows/slurm_packages_and_pyxis.yml
+++ b/.github/workflows/pyxis.yml
@@ -1,4 +1,4 @@
-name: Build slurm packages
+name: Build pyxis
 
 on:
   workflow_dispatch:
@@ -17,15 +17,17 @@ jobs:
       matrix:
         slurm:
           - version: 24.05.5
+        enroot:
+          - version: 3.5.0
+        pyxis:
+          - version: 0.21.0
         image:
-          - context: .
+          - file: pyxis/Dockerfile
             push: false
             platforms: linux/amd64
             load: true
-            ubuntu_version: jammy
-            cuda_version: 12.4.1
             build_args:
-              BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+              BASE_IMAGE=ubuntu:22.04
 
     steps:
       - name: Harden Runner
@@ -42,26 +44,27 @@ jobs:
       - name: Build docker images
         uses: docker/build-push-action@ca877d9245402d1537745e0e356eab47c3520991 # v6.13.0
         with:
-          context: ${{ matrix.image.context }}
+          file: ${{ matrix.image.file }}
           push: ${{ matrix.image.push }}
-          tags: slurm_builder:${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }}
+          tags: pyxis_builder:${{ matrix.pyxis.version }}
           platforms: ${{ matrix.image.platforms }}
           load: ${{ matrix.image.load }}
           build-args: |
             ${{ matrix.image.build_args }}
             SLURM_VERSION=${{ matrix.slurm.version }}
+            ENROOT_VERSION=${{ matrix.enroot.version }}
+            PYXIS_VERSION=${{ matrix.pyxis.version }}
           cache-from: type=local,src=.buildx-cache
           cache-to: type=local,dest=.buildx-cache,mode=max
 
-      - name: Create slurm_build_output directory
-        run: mkdir -p slurm_build_output
+      - name: Create pyxis_build_output directory
+        run: mkdir -p pyxis_build_output
 
       - name: Run Docker container and copy files
         run: |
-          container_id=$(docker create slurm_builder:${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }})
+          container_id=$(docker create pyxis_builder:${{ matrix.pyxis.version }})
           docker start $container_id
-          docker cp $container_id:/usr/src/debs/ ./slurm_build_output/
-          docker cp $container_id:/usr/src/nccl-tests/build/nccl-tests-perf.tar.gz ./slurm_build_output/
+          docker cp $container_id:/usr/src/debs/ ./pyxis_build_output/
           docker rm $container_id
 
       - name: Create GitHub Release and Upload DEB packages
@@ -69,15 +72,13 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          tag_name: ${{ matrix.image.cuda_version }}-${{ matrix.image.ubuntu_version }}-slurm${{ matrix.slurm.version }}
-          name: Cuda ${{ matrix.image.cuda_version }}, Ubuntu ${{ matrix.image.ubuntu_version }}, Slurm ${{ matrix.slurm.version }}
-          body: "Release based on Ubuntu ${{ matrix.image.ubuntu_version }} for Slurm ${{ matrix.slurm.version }} with cuda ${{ matrix.image.cuda_version }}, nccl-tests and pyxis"
+          tag_name: pyxis-${{ matrix.pyxis.version }}
+          name: Pyxis ${{ matrix.pyxis.version }}
           draft: false
           prerelease: false
           files: |
-            slurm_build_output/debs/*.deb
-            slurm_build_output/nccl-tests-perf.tar.gz
+            pyxis_build_output/debs/*.deb
 
       - name: Cleanup
-        run: rm -rf slurm_build_output/*.deb
+        run: rm -rf pyxis_build_output
 
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 06f0d4a..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,148 +0,0 @@
-ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
-
-FROM $BASE_IMAGE
-
-ARG SLURM_VERSION=24.05.5
-ARG OPENMPI_VERSION=4.1.7a1
-ARG OPENMPI_SUBVERSION=1.2310055
-ARG OFED_VERSION=23.10-2.1.3.1
-ARG ENROOT_VERSION=3.5.0
-ARG PYXIS_VERSION=0.21.0
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-# Install dependencies
-RUN apt-get update && \
-    apt -y install \
-        git \
-        build-essential \
-        devscripts \
-        debhelper \
-        fakeroot \
-        wget \
-        curl \
-        equivs \
-        autoconf \
-        pkg-config \
-        libssl-dev \
-        libpam0g-dev \
-        libtool \
-        libjansson-dev \
-        libjson-c-dev \
-        munge \
-        libmunge-dev \
-        libjwt0 \
-        libjwt-dev \
-        libhwloc-dev \
-        liblz4-dev \
-        flex \
-        libevent-dev \
-        jq \
-        squashfs-tools \
-        zstd \
-        zlib1g \
-        zlib1g-dev \
-        libpmix2 \
-        libpmix-dev
-
-# Download Slurm
-RUN cd /usr/src && \
-    wget https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 && \
-    tar -xvf slurm-${SLURM_VERSION}.tar.bz2 && \
-    rm -rf slurm-${SLURM_VERSION}.tar.bz2
-
-# Install Openmpi
-RUN cd /etc/apt/sources.list.d && \
-    wget https://linux.mellanox.com/public/repo/mlnx_ofed/${OFED_VERSION}/$(. /etc/os-release; echo $ID$VERSION_ID)/mellanox_mlnx_ofed.list && \
-    wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | apt-key add - && \
-    apt update && \
-    apt install openmpi=${OPENMPI_VERSION}-${OPENMPI_SUBVERSION}
-
-ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/lib
-ENV PATH=$PATH:/usr/mpi/gcc/openmpi-${OPENMPI_VERSION}/bin
-
-# Build deb packages for Slurm
-RUN cd /usr/src/slurm-${SLURM_VERSION} && \
-    sed -i 's/--with-pmix\b/--with-pmix=\/usr\/lib\/x86_64-linux-gnu\/pmix2/' debian/rules && \
-    mk-build-deps -i debian/control -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" && \
-    debuild -b -uc -us
-
-################################################################
-# RESULT
-################################################################
-# /usr/src/slurm-smd-client_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-dev_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-doc_24.05.02-1_all.deb
-# /usr/src/slurm-smd-libnss-slurm_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-libpam-slurm-adopt_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-libpmi0_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-libpmi2-0_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-libslurm-perl_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-openlava_24.05.02-1_all.deb
-# /usr/src/slurm-smd-sackd_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-slurmctld_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-slurmd_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-slurmdbd_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-slurmrestd_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-sview_24.05.02-1_amd64.deb
-# /usr/src/slurm-smd-torque_24.05.02-1_all.deb
-# /usr/src/slurm-smd_24.05.02-1_amd64.deb
-################################################################
-
-RUN cd /usr/src && \
-    git clone https://github.com/NVIDIA/nccl-tests.git && \
-    cd nccl-tests && \
-    make
-
-################################################################
-# RESULT
-################################################################
-# /usr/src/nccl-tests/build/all_gather_perf
-# /usr/src/nccl-tests/build/all_reduce_perf
-# /usr/src/nccl-tests/build/alltoall_perf
-# /usr/src/nccl-tests/build/broadcast_perf
-# /usr/src/nccl-tests/build/gather_perf
-# /usr/src/nccl-tests/build/hypercube_perf
-# /usr/src/nccl-tests/build/reduce_perf
-# /usr/src/nccl-tests/build/reduce_scatter_perf
-# /usr/src/nccl-tests/build/scatter_perf
-# /usr/src/nccl-tests/build/sendrecv_perf
-################################################################
-
-# Install enroot (required for pyxis)
-RUN curl -fSsL -o /tmp/enroot_${ENROOT_VERSION}-1_amd64.deb https://github.com/NVIDIA/enroot/releases/download/v${ENROOT_VERSION}/enroot_${ENROOT_VERSION}-1_amd64.deb && \
-    curl -fSsL -o /tmp/enroot+caps_${ENROOT_VERSION}-1_amd64.deb https://github.com/NVIDIA/enroot/releases/download/v${ENROOT_VERSION}/enroot+caps_${ENROOT_VERSION}-1_amd64.deb && \
-    apt install -y /tmp/*.deb && rm -rf /tmp/*.deb && \
-    mkdir -m 777 /usr/share/enroot/enroot-data && \
-    mkdir -m 755 /run/enroot && \
-    setcap cap_sys_admin+pe /usr/bin/enroot-mksquashovlfs && \
-    setcap cap_sys_admin,cap_mknod+pe /usr/bin/enroot-aufs2ovlfs
-
-
-# Download and build pyxis deb
-# TODO @itechdima: keep watching this PR https://github.com/NVIDIA/pyxis/pull/155
-RUN cd /usr/src && \
-    dpkg -i /usr/src/slurm-smd_${SLURM_VERSION}-1_amd64.deb && \
-    dpkg -i /usr/src/slurm-smd-dev_${SLURM_VERSION}-1_amd64.deb && \
-    wget https://github.com/itechdima/pyxis/archive/refs/tags/v"$PYXIS_VERSION".tar.gz && \
-    tar -xzvf v"$PYXIS_VERSION".tar.gz && \
-    rm v"$PYXIS_VERSION".tar.gz && \
-    cd pyxis-"$PYXIS_VERSION" && \
-    sed -i 's|dh_auto_install -- prefix= libdir=/usr/lib/$(DEB_HOST_MULTIARCH) datarootdir=/usr/share|dh_auto_install -- prefix=/usr libdir=/usr/lib/x86_64-linux-gnu datarootdir=/usr/share|' debian/rules && \
-    make orig && \
-    make deb
-
-################################################################
-# RESULT
-################################################################
-# ls -la ../nvslurm-plugin-pyxis*.deb
-# /usr/src/nvslurm-plugin-pyxis_0.20.0-1_amd64.deb
-################################################################
-
-# Move deb files
-RUN mkdir /usr/src/debs && \
-    mv /usr/src/*.deb /usr/src/debs/
-
-# Create tar.gz archive with NCCL-tests binaries
-RUN cd /usr/src/nccl-tests/build && \
-    tar -czvf nccl-tests-perf.tar.gz *_perf
diff --git a/pyxis/Dockerfile b/pyxis/Dockerfile
new file mode 100644
index 0000000..0b9a2f7
--- /dev/null
+++ b/pyxis/Dockerfile
@@ -0,0 +1,72 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM $BASE_IMAGE
+
+ARG SLURM_VERSION=24.05.5
+ARG ENROOT_VERSION=3.5.0
+ARG PYXIS_VERSION=0.21.0
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install dependencies
+RUN apt-get update && \
+    apt -y install \
+        wget \
+        curl \
+        git \
+        build-essential \
+        devscripts \
+        debhelper \
+        fakeroot \
+        autoconf \
+        pkg-config \
+        libssl-dev \
+        libpam0g-dev \
+        libtool \
+        libjansson-dev \
+        libjson-c-dev \
+        libmunge-dev \
+        libhwloc-dev \
+        liblz4-dev \
+        flex \
+        libevent-dev \
+        squashfs-tools \
+        zstd \
+        libpmix2 \
+        libpmix-dev && \
+    apt clean
+
+# Install enroot (required for pyxis)
+RUN curl -fSsL -o /tmp/enroot_${ENROOT_VERSION}-1_amd64.deb https://github.com/NVIDIA/enroot/releases/download/v${ENROOT_VERSION}/enroot_${ENROOT_VERSION}-1_amd64.deb && \
+    curl -fSsL -o /tmp/enroot+caps_${ENROOT_VERSION}-1_amd64.deb https://github.com/NVIDIA/enroot/releases/download/v${ENROOT_VERSION}/enroot+caps_${ENROOT_VERSION}-1_amd64.deb && \
+    apt install -y /tmp/*.deb && rm -rf /tmp/*.deb && \
+    mkdir -m 777 /usr/share/enroot/enroot-data && \
+    mkdir -m 755 /run/enroot && \
+    setcap cap_sys_admin+pe /usr/bin/enroot-mksquashovlfs && \
+    setcap cap_sys_admin,cap_mknod+pe /usr/bin/enroot-aufs2ovlfs
+
+
+# Download and build pyxis deb
+# TODO @itechdima: keep watching this PR https://github.com/NVIDIA/pyxis/pull/155
+RUN cd /usr/src && \
+    wget https://github.com/nebius/slurm-deb-packages/releases/download/slurm-packages-${SLURM_VERSION}/slurm-smd_${SLURM_VERSION}-1_amd64.deb && \
+    wget https://github.com/nebius/slurm-deb-packages/releases/download/slurm-packages-${SLURM_VERSION}/slurm-smd-dev_${SLURM_VERSION}-1_amd64.deb && \
+    apt install -y /usr/src/*.deb && \
+    wget https://github.com/itechdima/pyxis/archive/refs/tags/v"$PYXIS_VERSION".tar.gz && \
+    tar -xzvf v"$PYXIS_VERSION".tar.gz && \
+    rm v"$PYXIS_VERSION".tar.gz && \
+    cd pyxis-"$PYXIS_VERSION" && \
+    sed -i 's|dh_auto_install -- prefix= libdir=/usr/lib/$(DEB_HOST_MULTIARCH) datarootdir=/usr/share|dh_auto_install -- prefix=/usr libdir=/usr/lib/x86_64-linux-gnu datarootdir=/usr/share|' debian/rules && \
+    make orig && \
+    make deb
+
+################################################################
+# RESULT
+################################################################
+# ls -la ../nvslurm-plugin-pyxis*.deb
+# /usr/src/nvslurm-plugin-pyxis_0.20.0-1_amd64.deb
+################################################################
+
+# Move deb files
+RUN mkdir /usr/src/debs && \
+    mv /usr/src/nvslurm-plugin-pyxis*.deb /usr/src/debs/

From e47199d0ed58e6cad1ed30aba9971a2340d4b0ab Mon Sep 17 00:00:00 2001
From: Pavel Sofronii
Date: Wed, 19 Feb 2025 14:59:36 +0100
Subject: [PATCH 2/2] add enroot and slurm version to the tags

---
Review notes (ignored by `git am`):

* The Slurm and Enroot versions are added to the builder image name and
  to the release tag, and a release body is added.
* The removed lines and the new image references use
  `matrix.pyxis.version` and `docker create`, so this patch applies on top
  of the corrected 1/2. The `index` hashes predate that fix and are stale.

 .github/workflows/pyxis.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pyxis.yml b/.github/workflows/pyxis.yml
index 6b40986..78f07f0 100644
--- a/.github/workflows/pyxis.yml
+++ b/.github/workflows/pyxis.yml
@@ -46,7 +46,7 @@ jobs:
         with:
           file: ${{ matrix.image.file }}
           push: ${{ matrix.image.push }}
-          tags: pyxis_builder:${{ matrix.pyxis.version }}
+          tags: pyxis_builder_slurm_${{ matrix.slurm.version }}_enroot_${{ matrix.enroot.version }}:${{ matrix.pyxis.version }}
           platforms: ${{ matrix.image.platforms }}
           load: ${{ matrix.image.load }}
           build-args: |
@@ -62,7 +62,7 @@ jobs:
 
       - name: Run Docker container and copy files
         run: |
-          container_id=$(docker create pyxis_builder:${{ matrix.pyxis.version }})
+          container_id=$(docker create pyxis_builder_slurm_${{ matrix.slurm.version }}_enroot_${{ matrix.enroot.version }}:${{ matrix.pyxis.version }})
           docker start $container_id
           docker cp $container_id:/usr/src/debs/ ./pyxis_build_output/
           docker rm $container_id
@@ -72,8 +72,9 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          tag_name: pyxis-${{ matrix.pyxis.version }}
+          tag_name: pyxis-${{ matrix.pyxis.version }}_slurm_${{ matrix.slurm.version }}_enroot_${{ matrix.enroot.version }}
           name: Pyxis ${{ matrix.pyxis.version }}
+          body: Pyxis ${{ matrix.pyxis.version }} for Slurm ${{ matrix.slurm.version }} and Enroot ${{ matrix.enroot.version }}
           draft: false
           prerelease: false
           files: |