From 63ecbb42cc1d9159b56fd39586167d0bc460cbb5 Mon Sep 17 00:00:00 2001
From: Andri Saar
Date: Fri, 31 May 2024 17:56:45 +0000
Subject: [PATCH] Create a flavour of Containers base image that includes nvidia drivers

I'm pretty sure this won't work as-is, as we're not running the default
Debian kernel, but what I'm hoping to achieve with this is to get the
build infrastructure working.

Bug: 289334314
Change-Id: I15499e795c2d3412830f4fe539dd2afa191b1f7e
---
NOTE(review): BUILD now references `@oak_containers_nvidia_sysimage_base`,
but this patch touches no WORKSPACE file -- confirm that repository is
declared elsewhere. build-base.sh still prints only the `push_base` command,
and the README steps do not mention `push_nvidia_base` either.

 oak_containers_system_image/BUILD             | 44 +++++++++++++++++++
 oak_containers_system_image/README.md         |  5 +++
 oak_containers_system_image/build-base.sh     | 12 +++++
 .../nvidia_base_image.Dockerfile              | 22 ++++++++++
 4 files changed, 83 insertions(+)
 create mode 100644 oak_containers_system_image/nvidia_base_image.Dockerfile

diff --git a/oak_containers_system_image/BUILD b/oak_containers_system_image/BUILD
index 398fbd6077d..c07d1398ab1 100644
--- a/oak_containers_system_image/BUILD
+++ b/oak_containers_system_image/BUILD
@@ -49,6 +49,14 @@ oci_image(
     tars = [":rust_bins_tar"],
 )
 
+oci_image(
+    name = "oak_containers_nvidia_system_image_oci_image",
+    base = "@oak_containers_nvidia_sysimage_base",
+    # This rule will fail unless build-bazel.sh has been run.
+    tags = ["noci"],
+    tars = [":rust_bins_tar"],
+)
+
 oci_runtime_bundle(
     name = "oak_containers_system_image",
     image = ":oak_containers_system_image_oci_image",
@@ -57,6 +65,14 @@ oci_runtime_bundle(
     tags = ["noci"],
 )
 
+oci_runtime_bundle(
+    name = "oak_containers_nvidia_system_image",
+    image = ":oak_containers_nvidia_system_image_oci_image",
+    rootfs_only = True,
+    # This rule will fail unless build-bazel.sh has been run.
+    tags = ["noci"],
+)
+
 ### Base Image Update Targets
 ### These can't yet be run automatically. First, the build-base.sh script must be run.
 
@@ -70,6 +86,15 @@ filegroup(
     tags = ["noci"],
 )
 
+filegroup(
+    name = "nvidia_base_image_tar",
+    srcs = [
+        "target/nvidia-base-image.tar",
+    ],
+    # This rule will fail until build-base.sh has been run
+    tags = ["noci"],
+)
+
 # Defines labels added to :oak_containers_sysimage_base. When built with
 # `--stamp`, overrides in `stamp_substitutions` override those in
 # `substitutions`; `stamp_substitutions` can access workspace status values
@@ -97,6 +122,16 @@ oci_image(
     tars = [":base_image_tar"],
 )
 
+oci_image(
+    name = "oak_containers_nvidia_sysimage_base",
+    architecture = "amd64",
+    labels = ":oak_containers_sysimage_base_labels",
+    os = "linux",
+    # This rule will fail until build-base.sh has been run
+    tags = ["noci"],
+    tars = [":nvidia_base_image_tar"],
+)
+
 # After running this target, you will need to update the hash for
 # oak_containers_sysimage_base in the WORKSPACE file to use it.
 oci_push(
@@ -107,3 +142,12 @@ oci_push(
     # This rule will fail until build-base.sh has been run
     tags = ["noci"],
 )
+
+oci_push(
+    name = "push_nvidia_base",
+    image = ":oak_containers_nvidia_sysimage_base",
+    remote_tags = ["latest"],
+    repository = "europe-west2-docker.pkg.dev/oak-ci/oak-containers-sysimage-base/oak-containers-nvidia-sysimage-base",
+    # This rule will fail until build-base.sh has been run
+    tags = ["noci"],
+)
diff --git a/oak_containers_system_image/README.md b/oak_containers_system_image/README.md
index 55d9bcf0248..b9784513f49 100644
--- a/oak_containers_system_image/README.md
+++ b/oak_containers_system_image/README.md
@@ -34,6 +34,8 @@ To update the base image and push it:
 1. ./oak_containers_system_image/build-base.sh
 2. bazel run --stamp oak_containers_system_image:push_base
 
+There is also a version of the base image that includes the nvidia drivers.
+
 ## Bazel-Based System Image Tools
 
 `just oak_containers_system_image` and some `BUILD` targets
@@ -66,3 +68,6 @@ How this works:
   base image built with the old way has all files in the top level. We'll
   probably need to mimic that structure. There are lots of ways to do this, but
   it's not clear what the most correct one is.
+
+- The version with nvidia drivers is still largely untested and under
+  development.
diff --git a/oak_containers_system_image/build-base.sh b/oak_containers_system_image/build-base.sh
index 24148d68217..bdcd021c1d0 100755
--- a/oak_containers_system_image/build-base.sh
+++ b/oak_containers_system_image/build-base.sh
@@ -16,20 +16,24 @@ cd "$SCRIPTS_DIR"
 mkdir --parent target
 
 docker buildx build . --tag=oak-containers-sysimage-base:latest --file base_image.Dockerfile
+docker buildx build . --tag=oak-containers-sysimage-nvidia-base:latest --file nvidia_base_image.Dockerfile
 
 # We need to actually create a container, otherwise we won't be able to use
 # `docker export` that gives us a filesystem image.
 # (`docker save` creates a tarball which has all the layers separate, which is
 # _not_ what we want.)
 readonly NEW_DOCKER_CONTAINER_ID="$(docker create oak-containers-sysimage-base:latest)"
+readonly NEW_NVIDIA_DOCKER_CONTAINER_ID="$(docker create oak-containers-sysimage-nvidia-base:latest)"
 
 # We export a plain tarball.
 # The oak_containers_sysimage_base oci_image rule will use this tarball to
 # create an OCI image that it can then push to Google artifact registry.
 # There *might* be a better approach here, but this is working for now.
 docker export "$NEW_DOCKER_CONTAINER_ID" > target/base-image.tar
+docker export "$NEW_NVIDIA_DOCKER_CONTAINER_ID" > target/nvidia-base-image.tar
 
 docker rm "$NEW_DOCKER_CONTAINER_ID"
+docker rm "$NEW_NVIDIA_DOCKER_CONTAINER_ID"
 
 # Repackage base-image.tar so that entries are in a consistent order and have a
 # consistent mtime. fakeroot ensures that file permissions are maintained, even
@@ -47,6 +51,14 @@ fakeroot -- sh -c "\
     --numeric-owner --directory \"${sandbox}\" ."
 rm -rf -- "$sandbox"
 
+sandbox="$(mktemp -d)"
+fakeroot -- sh -c "\
+  tar --extract --file target/nvidia-base-image.tar --directory \"${sandbox}\" \
+  && cp files/etc/hosts \"${sandbox}/etc/hosts\" \
+  && tar --create --sort=name --file target/nvidia-base-image.tar --mtime='2000-01-01Z' \
+    --numeric-owner --directory \"${sandbox}\" ."
+rm -rf -- "$sandbox"
+
 set +o xtrace
 printf "\n\nIf you want to push this newly created base, run:\n"
 printf "\nbazel run oak_containers_system_image:push_base\n\n"
diff --git a/oak_containers_system_image/nvidia_base_image.Dockerfile b/oak_containers_system_image/nvidia_base_image.Dockerfile
new file mode 100644
index 00000000000..67718e278cd
--- /dev/null
+++ b/oak_containers_system_image/nvidia_base_image.Dockerfile
@@ -0,0 +1,22 @@
+# The expectation is that we build `base_image.Dockerfile` before this one.
+# hadolint ignore=DL3007
+FROM oak-containers-sysimage-base:latest
+
+# We need to enable `contrib` for `nvidia-support`.
+RUN sed -i -e '/^Components: main/cComponents: main contrib' \
+    /etc/apt/sources.list.d/debian.sources
+
+RUN apt-get --yes update \
+    && apt-get install --yes --no-install-recommends ca-certificates curl
+
+# Install the cuda-keyring package; `--fail` aborts the build on an HTTP error.
+RUN curl --fail --location --remote-name https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb \
+    && dpkg -i cuda-keyring_1.1-1_all.deb \
+    && rm -f cuda-keyring_1.1-1_all.deb
+
+# Install the CUDA toolkit and driver, then clean the apt caches in this layer.
+RUN apt-get --yes update \
+    && apt-get install --yes --no-install-recommends \
+    cuda-toolkit-12-4 nvidia-driver \
+    && apt-get clean \
+    && rm --recursive --force /var/lib/apt/lists/*