From 2adc490790abe1b079a13bba734369002bcb18ec Mon Sep 17 00:00:00 2001
From: fedora Cloud User
Date: Thu, 23 May 2024 20:53:31 +0000
Subject: [PATCH] Nvidia settings API for container runtime

Signed-off-by: Monirul Islam
---
Review notes:
- The defaults for the new settings match the values that were previously
  hard-coded in nvidia-container-toolkit-config-k8s.toml (device lists accepted
  as volume mounts = true, envvar accepted when unprivileged = false), so hosts
  that upgrade keep their existing behavior.
- The migration registers the new services.* and configuration-files.* prefixes
  in addition to the settings prefix, since all three are introduced by the
  shared defaults file below.
- TODO(review): only aws-k8s-1.30-nvidia links the new defaults in this patch,
  while the config template and the migration gate presumably apply to every
  aws-k8s *-nvidia variant; confirm whether the other variants need the same
  defaults.d symlink.

 Release.toml                                  |  1 +
 .../nvidia-container-toolkit-config-k8s       | 12 ++++++++
 .../nvidia-container-toolkit-config-k8s.toml  |  8 ------
 ...nvidia-container-toolkit-tmpfiles-k8s.conf |  2 +-
 .../nvidia-container-toolkit.spec             |  4 +--
 sources/Cargo.lock                            |  8 ++++++
 sources/Cargo.toml                            |  1 +
 .../container-runtime-nvidia-k8s/Cargo.toml   | 15 ++++++++++
 .../container-runtime-nvidia-k8s/build.rs     |  7 +++++
 .../container-runtime-nvidia-k8s/src/main.rs  | 28 +++++++++++++++++++
 .../nvidia-k8s-container-toolkit.toml         | 15 ++++++++++
 .../81-nvidia-k8s-container-toolkit.toml      |  1 +
 sources/models/src/lib.rs                     | 14 ++++++++++
 13 files changed, 105 insertions(+), 11 deletions(-)
 create mode 100644 packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s
 delete mode 100644 packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s.toml
 create mode 100644 sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/Cargo.toml
 create mode 100644 sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/build.rs
 create mode 100644 sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/src/main.rs
 create mode 100644 sources/models/shared-defaults/nvidia-k8s-container-toolkit.toml
 create mode 120000 sources/models/src/aws-k8s-1.30-nvidia/defaults.d/81-nvidia-k8s-container-toolkit.toml

diff --git a/Release.toml b/Release.toml
index 4a5584e9628..c08ae74224d 100644
--- a/Release.toml
+++ b/Release.toml
@@ -314,4 +314,5 @@ version = "1.21.0"
 ]
 "(1.20.0, 1.21.0)" = [
     "migrate_v1.21.0_pluto-remove-generators-v0-1-0.lz4",
+    "migrate_v1.21.0_container-runtime-nvidia-k8s.lz4",
 ]
diff --git a/packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s b/packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s
new file mode 100644
index 00000000000..16c756c32d6
--- /dev/null
+++ b/packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s
@@ -0,0 +1,12 @@
+[required-extensions]
+kubernetes = "v1"
+
++++
+accept-nvidia-visible-devices-as-volume-mounts = {{settings.kubernetes.nvidia.container-runtime.visible-devices-as-volume-mounts}}
+accept-nvidia-visible-devices-envvar-when-unprivileged = {{settings.kubernetes.nvidia.container-runtime.visible-devices-envvar-when-unprivileged}}
+
+[nvidia-container-cli]
+root = "/"
+path = "/usr/bin/nvidia-container-cli"
+environment = []
+ldconfig = "@/sbin/ldconfig"
diff --git a/packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s.toml b/packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s.toml
deleted file mode 100644
index 9ef532c8962..00000000000
--- a/packages/nvidia-container-toolkit/nvidia-container-toolkit-config-k8s.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-accept-nvidia-visible-devices-as-volume-mounts = true
-accept-nvidia-visible-devices-envvar-when-unprivileged = false
-
-[nvidia-container-cli]
-root = "/"
-path = "/usr/bin/nvidia-container-cli"
-environment = []
-ldconfig = "@/sbin/ldconfig"
diff --git a/packages/nvidia-container-toolkit/nvidia-container-toolkit-tmpfiles-k8s.conf b/packages/nvidia-container-toolkit/nvidia-container-toolkit-tmpfiles-k8s.conf
index 011192e391b..f669cd3cc37 100644
--- a/packages/nvidia-container-toolkit/nvidia-container-toolkit-tmpfiles-k8s.conf
+++ b/packages/nvidia-container-toolkit/nvidia-container-toolkit-tmpfiles-k8s.conf
@@ -1 +1 @@
-C /etc/nvidia-container-runtime/config.toml - - - - /usr/share/factory/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
+d /etc/nvidia-container-runtime - - - - -
diff --git a/packages/nvidia-container-toolkit/nvidia-container-toolkit.spec b/packages/nvidia-container-toolkit/nvidia-container-toolkit.spec
index 93412ba7757..ca35b441c5c 100644
--- a/packages/nvidia-container-toolkit/nvidia-container-toolkit.spec
+++ b/packages/nvidia-container-toolkit/nvidia-container-toolkit.spec
@@ -13,7 +13,7 @@ License: Apache-2.0
 URL: https://%{goimport}
 
 Source0: https://%{goimport}/archive/v%{gover}/nvidia-container-toolkit-%{gover}.tar.gz
-Source1: nvidia-container-toolkit-config-k8s.toml
+Source1: nvidia-container-toolkit-config-k8s
 Source2: nvidia-container-toolkit-config-ecs.toml
 Source3: nvidia-oci-hooks-json
 Source4: nvidia-gpu-devices.rules
@@ -82,5 +82,5 @@ ln -s shimpei %{buildroot}%{_cross_bindir}/nvidia-oci
 %{_cross_tmpfilesdir}/nvidia-container-toolkit-ecs.conf
 
 %files k8s
-%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
+%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s
 %{_cross_tmpfilesdir}/nvidia-container-toolkit-k8s.conf
diff --git a/sources/Cargo.lock b/sources/Cargo.lock
index 5e762d9de24..6c5619bb72a 100644
--- a/sources/Cargo.lock
+++ b/sources/Cargo.lock
@@ -1434,6 +1434,14 @@ dependencies = [
  "migration-helpers",
 ]
 
+[[package]]
+name = "container-runtime-nvidia-k8s"
+version = "0.1.0"
+dependencies = [
+ "bottlerocket-variant",
+ "migration-helpers",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.4.0"
diff --git a/sources/Cargo.toml b/sources/Cargo.toml
index 504161e8360..f5581dd459d 100644
--- a/sources/Cargo.toml
+++ b/sources/Cargo.toml
@@ -91,6 +91,7 @@ members = [
     "api/migration/migrations/v1.20.0/aws-control-container-v0-7-12",
     "api/migration/migrations/v1.20.0/public-control-container-v0-7-12",
     "api/migration/migrations/v1.21.0/pluto-remove-generators-v0-1-0",
+    "api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s",
 
     "bloodhound",
 
diff --git a/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/Cargo.toml b/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/Cargo.toml
new file mode 100644
index 00000000000..a20742b0500
--- /dev/null
+++ b/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "container-runtime-nvidia-k8s"
+version = "0.1.0"
+edition = "2021"
+authors = ["Monirul Islam "]
+license = "Apache-2.0 OR MIT"
+publish = false
+# Don't rebuild crate just because of changes to README.
+exclude = ["README.md"]
+
+[dependencies]
+migration-helpers = { path = "../../../migration-helpers", version = "0.1.0"}
+
+[build-dependencies]
+bottlerocket-variant = { version = "0.1", path = "../../../../../bottlerocket-variant" }
diff --git a/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/build.rs b/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/build.rs
new file mode 100644
index 00000000000..51d16cf1b4c
--- /dev/null
+++ b/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/build.rs
@@ -0,0 +1,7 @@
+use bottlerocket_variant::Variant;
+
+// Emit the variant cfgs that src/main.rs tests with `cfg!(variant_family = ...)`.
+fn main() {
+    let variant = Variant::from_env().unwrap();
+    variant.emit_cfgs();
+}
diff --git a/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/src/main.rs b/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/src/main.rs
new file mode 100644
index 00000000000..462e15ddab3
--- /dev/null
+++ b/sources/api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s/src/main.rs
@@ -0,0 +1,28 @@
+use migration_helpers::common_migrations::{AddPrefixesMigration, NoOpMigration};
+use migration_helpers::{migrate, Result};
+use std::process;
+
+/// We added a new setting for configuring the NVIDIA container runtime on NVIDIA k8s variants,
+/// along with the service and configuration file that render it. Every prefix introduced by the
+/// new defaults is listed so that a downgrade removes all of them again.
+fn run() -> Result<()> {
+    if cfg!(variant_family = "aws-k8s") && cfg!(variant_flavor = "nvidia") {
+        migrate(AddPrefixesMigration(vec![
+            "settings.kubernetes.nvidia.container-runtime",
+            "services.nvidia-container-toolkit",
+            "configuration-files.nvidia-container-toolkit",
+        ]))
+    } else {
+        migrate(NoOpMigration)
+    }
+}
+
+// Returning a Result from main makes it print a Debug representation of the error, but with Snafu
+// we have nice Display representations of the error, so we wrap "main" (run) and print any error.
+// https://github.com/shepmaster/snafu/issues/110
+fn main() {
+    if let Err(e) = run() {
+        eprintln!("{}", e);
+        process::exit(1);
+    }
+}
diff --git a/sources/models/shared-defaults/nvidia-k8s-container-toolkit.toml b/sources/models/shared-defaults/nvidia-k8s-container-toolkit.toml
new file mode 100644
index 00000000000..a874ba0e56e
--- /dev/null
+++ b/sources/models/shared-defaults/nvidia-k8s-container-toolkit.toml
@@ -0,0 +1,15 @@
+[settings.kubernetes.nvidia.container-runtime]
+# Defaults match the values previously hard-coded in the k8s config.toml.
+visible-devices-as-volume-mounts = true
+visible-devices-envvar-when-unprivileged = false
+
+[metadata.settings.kubernetes.nvidia.container-runtime]
+affected-services = ["nvidia-container-toolkit"]
+
+[services.nvidia-container-toolkit]
+configuration-files = ["nvidia-container-toolkit"]
+restart-commands = []
+
+[configuration-files.nvidia-container-toolkit]
+path = "/etc/nvidia-container-runtime/config.toml"
+template-path = "/usr/share/factory/nvidia-container-runtime/nvidia-container-toolkit-config-k8s"
diff --git a/sources/models/src/aws-k8s-1.30-nvidia/defaults.d/81-nvidia-k8s-container-toolkit.toml b/sources/models/src/aws-k8s-1.30-nvidia/defaults.d/81-nvidia-k8s-container-toolkit.toml
new file mode 120000
index 00000000000..0b3f42928f1
--- /dev/null
+++ b/sources/models/src/aws-k8s-1.30-nvidia/defaults.d/81-nvidia-k8s-container-toolkit.toml
@@ -0,0 +1 @@
+../../../shared-defaults/nvidia-k8s-container-toolkit.toml
\ No newline at end of file
diff --git a/sources/models/src/lib.rs b/sources/models/src/lib.rs
index aa1a08f7bf7..588b3b7d953 100644
--- a/sources/models/src/lib.rs
+++ b/sources/models/src/lib.rs
@@ -317,6 +317,7 @@ struct KubernetesSettings {
     hostname_override: ValidLinuxHostname,
     // Generated in `k8s-1.25+` variants only
     seccomp_default: bool,
+    nvidia: K8sNvidiaSettings,
 }
 
 // ECS settings.
@@ -572,3 +573,16 @@ struct Report {
     name: String,
     description: String,
 }
+
+// NVIDIA-specific Kubernetes settings, exposed as `settings.kubernetes.nvidia`.
+#[model]
+struct K8sNvidiaSettings {
+    container_runtime: K8sContainerRuntimeSettings,
+}
+
+// Values rendered into the `accept-nvidia-visible-devices-*` keys of the runtime's config.toml.
+#[model]
+struct K8sContainerRuntimeSettings {
+    visible_devices_as_volume_mounts: bool,
+    visible_devices_envvar_when_unprivileged: bool,
+}