From 13222dff209de5bcc1e0bdf331255d2b269c1c4a Mon Sep 17 00:00:00 2001 From: Andri Saar Date: Thu, 27 Jun 2024 17:27:59 +0000 Subject: [PATCH] Build (and install) nvidia drivers in the nvidia_base_image This is a bit hacky, as we effectively rebuild the whole kernel while creating the system image so that we can get the kernel-headers package, which is required for DKMS to build the nvidia drivers. We don't use the kernel built here but rather rely on the one nix built. This does trigger some warnings, as the nix-built kernel and the nvidia modules have been built using different compilers, which is dangerous territory. At some point in the future we need to see how we could build both the kernel and the modules in the same environment. (Which likely means not using nix to build the kernel.) For now, the modules can be loaded just fine and `nvidia-smi` detects the GPU. Bug: 289334314 Change-Id: I4b90b7207429e7729af74cd87c79ea8986f1de11 --- flake.nix | 10 ++++++++- oak_containers_system_image/build-base.sh | 9 +++++++- .../nvidia_base_image.Dockerfile | 22 +++++++++++++++++-- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/flake.nix b/flake.nix index 8cc9b7bf231..d28fbc376a6 100644 --- a/flake.nix +++ b/flake.nix @@ -30,6 +30,7 @@ url = "https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-${linux_kernel_version}.tar.xz"; sha256 = "01b414ba98fd189ecd544435caf3860ae2a790e3ec48f5aa70fdf42dc4c5c04a"; }; + linux_kernel_config = ./oak_containers_kernel/configs/${linux_kernel_version}/minimal.config; # Build the linux kernel for Oak Containers as a nix package, which simplifies # reproducibility. # Note that building a package via nix is not by itself a guarantee of @@ -40,7 +41,7 @@ # - CONFIG_MODULE_SIG is not set # - CONFIG_MODULE_SIG_ALL is not set # - CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT is not set - configfile = ./oak_containers_kernel/configs/6.9.1/minimal.config; + configfile = linux_kernel_config; # And also the following build variables. 
# See https://docs.kernel.org/kbuild/reproducible-builds.html. extraMakeFlags = [ @@ -190,9 +191,16 @@ # Shell for building Oak Containers kernel and system image. This is not included in the # default shell because it is not needed as part of the CI. containers = with pkgs; mkShell { + # We need access to the kernel source and configuration, not just the binaries, to + # build the system image with nvidia drivers in it. + # See oak_containers_system_image/build-base.sh (and nvidia_base_image.Dockerfile) for + # more details. shellHook = '' export LINUX_KERNEL="${linux_kernel}" export VANILLA_LINUX_KERNEL="${vanilla_linux_kernel}" + export LINUX_KERNEL_VERSION="${linux_kernel_version}" + export LINUX_KERNEL_SOURCE="${linux_kernel_src}" + export LINUX_KERNEL_CONFIG="${linux_kernel_config}" ''; inputsFrom = [ base diff --git a/oak_containers_system_image/build-base.sh b/oak_containers_system_image/build-base.sh index c2e7466a7dd..3824cbb5473 100755 --- a/oak_containers_system_image/build-base.sh +++ b/oak_containers_system_image/build-base.sh @@ -16,7 +16,14 @@ cd "$SCRIPTS_DIR" mkdir --parent target docker buildx build . --tag=oak-containers-sysimage-base:latest --file base_image.Dockerfile -docker buildx build . --tag=oak-containers-sysimage-nvidia-base:latest --file nvidia_base_image.Dockerfile + +cp --force "$LINUX_KERNEL_SOURCE" target/linux-"$LINUX_KERNEL_VERSION".tar.xz +cp --force "$LINUX_KERNEL_CONFIG" target/minimal.config + +docker buildx build . \ + --build-arg LINUX_KERNEL_VERSION="$LINUX_KERNEL_VERSION" \ + --tag=oak-containers-sysimage-nvidia-base:latest \ + --file nvidia_base_image.Dockerfile # We need to actually create a container, otherwise we won't be able to use # `docker export` that gives us a filesystem image. 
diff --git a/oak_containers_system_image/nvidia_base_image.Dockerfile b/oak_containers_system_image/nvidia_base_image.Dockerfile index 67718e278cd..3fb716819a8 100644 --- a/oak_containers_system_image/nvidia_base_image.Dockerfile +++ b/oak_containers_system_image/nvidia_base_image.Dockerfile @@ -2,6 +2,8 @@ # hadolint ignore=DL3007 FROM oak-containers-sysimage-base:latest +ARG LINUX_KERNEL_VERSION + # We need to enable `contrib` for `nvidia-support`. RUN sed -i -e '/^Components: main/cComponents: main contrib' \ /etc/apt/sources.list.d/debian.sources @@ -16,7 +18,23 @@ RUN curl -O -L https://developer.download.nvidia.com/compute/cuda/repos/debian12 RUN apt-get --yes update \ && apt-get install --yes --no-install-recommends \ - cuda-toolkit-12-4 nvidia-driver \ + nvidia-driver nvidia-smi \ + # Stuff to build kernel (will be purged later, see below) + libc6-dev flex bison build-essential bc cpio libncurses5-dev libelf-dev libssl-dev dwarves debhelper-compat rsync \ # Cleanup && apt-get clean \ - && rm --recursive --force /var/lib/apt/lists/* \ No newline at end of file + && rm --recursive --force /var/lib/apt/lists/* + +COPY target/linux-${LINUX_KERNEL_VERSION}.tar.xz /tmp +COPY target/minimal.config /tmp + +RUN tar --directory=/tmp --extract --file /tmp/linux-${LINUX_KERNEL_VERSION}.tar.xz \ + && cp /tmp/minimal.config /tmp/linux-${LINUX_KERNEL_VERSION}/.config \ + && make --directory=/tmp/linux-${LINUX_KERNEL_VERSION} bindeb-pkg \ + && dpkg --install /tmp/linux-headers-${LINUX_KERNEL_VERSION}_${LINUX_KERNEL_VERSION}-1_amd64.deb \ + && dkms build -m nvidia-current -v "$(dpkg-query --showformat='${source:Upstream-Version}' --show nvidia-driver)" -k ${LINUX_KERNEL_VERSION} \ + && dkms install -m nvidia-current -v "$(dpkg-query --showformat='${source:Upstream-Version}' --show nvidia-driver)" -k ${LINUX_KERNEL_VERSION} \ + && rm -rf /tmp/linux-${LINUX_KERNEL_VERSION} /tmp/linux-${LINUX_KERNEL_VERSION}.tar.xz /tmp/minimal.config \ + && apt-get --yes purge libc6-dev flex 
bison build-essential bc cpio libncurses5-dev libelf-dev libssl-dev dwarves debhelper-compat rsync \ + && apt-get --yes autoremove +