Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nvidia settings API for container runtime #3994

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -314,4 +314,5 @@ version = "1.21.0"
]
"(1.20.0, 1.21.0)" = [
"migrate_v1.21.0_pluto-remove-generators-v0-1-0.lz4",
"migrate_v1.21.0_container-runtime-nvidia-k8s.lz4"
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[required-extensions]
kubernetes = "v1"

+++
accept-nvidia-visible-devices-as-volume-mounts = {{settings.kubernetes.nvidia.container-runtime.visible-devices-as-volume-mounts}}
accept-nvidia-visible-devices-envvar-when-unprivileged = {{settings.kubernetes.nvidia.container-runtime.visible-devices-envvar-when-unprivileged}}
Comment on lines +5 to +6
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's be safe, and use the {{default}} helper, otherwise if settings.kubernetes.nvidia.container-runtime.visible-devices-as-volume-mounts isn't present, the render will fail.


[nvidia-container-cli]
root = "/"
path = "/usr/bin/nvidia-container-cli"
environment = []
ldconfig = "@/sbin/ldconfig"

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1 +1 @@
C /etc/nvidia-container-runtime/config.toml - - - - /usr/share/factory/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
d /etc/nvidia-container-runtime - - - - -
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ License: Apache-2.0
URL: https://%{goimport}

Source0: https://%{goimport}/archive/v%{gover}/nvidia-container-toolkit-%{gover}.tar.gz
Source1: nvidia-container-toolkit-config-k8s.toml
Source1: nvidia-container-toolkit-config-k8s
Source2: nvidia-container-toolkit-config-ecs.toml
Source3: nvidia-oci-hooks-json
Source4: nvidia-gpu-devices.rules
Expand Down Expand Up @@ -82,5 +82,5 @@ ln -s shimpei %{buildroot}%{_cross_bindir}/nvidia-oci
%{_cross_tmpfilesdir}/nvidia-container-toolkit-ecs.conf

%files k8s
%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s
%{_cross_tmpfilesdir}/nvidia-container-toolkit-k8s.conf
8 changes: 8 additions & 0 deletions sources/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions sources/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ members = [
"api/migration/migrations/v1.20.0/aws-control-container-v0-7-12",
"api/migration/migrations/v1.20.0/public-control-container-v0-7-12",
"api/migration/migrations/v1.21.0/pluto-remove-generators-v0-1-0",
"api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s",

"bloodhound",

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "container-runtime-nvidia-k8s"
version = "0.1.0"
edition = "2021"
authors = ["Monirul Islam <[email protected]>"]
license = "Apache-2.0 OR MIT"
publish = false
# Don't rebuild crate just because of changes to README.
exclude = ["README.md"]

[dependencies]
migration-helpers = { path = "../../../migration-helpers", version = "0.1.0"}

[build-dependencies]
bottlerocket-variant = { version = "0.1", path = "../../../../../bottlerocket-variant" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
use bottlerocket_variant::Variant;

/// Build-script entry point.
///
/// Loads the `Variant` with `Variant::from_env()` (panicking if that fails, which aborts the
/// build) and calls `emit_cfgs()` on it. Presumably this is what makes the `variant_family` /
/// `variant_flavor` cfg values available to the crate's `cfg!` checks -- confirm against the
/// `bottlerocket-variant` crate.
fn main() {
    Variant::from_env().unwrap().emit_cfgs();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
use migration_helpers::common_migrations::{AddPrefixesMigration, NoOpMigration};
use migration_helpers::{migrate, Result};
use std::process;

/// We added new settings for configuring the NVIDIA container runtime, along with the
/// `nvidia-container-toolkit` service and configuration file that render them, only for NVIDIA
/// k8s variants.
///
/// On those variants every prefix introduced by the change is registered with
/// `AddPrefixesMigration` so that a downgrade removes the settings *and* the service /
/// configuration-file entries that reference a template older images do not ship. On every other
/// variant nothing was added, so the migration is a no-op.
///
/// NOTE(review): the `affected-services` metadata attached to
/// `settings.kubernetes.nvidia.container-runtime` is not covered here; if it needs to be removed
/// on downgrade it requires a separate metadata migration -- confirm.
fn run() -> Result<()> {
    if cfg!(variant_family = "aws-k8s") && cfg!(variant_flavor = "nvidia") {
        migrate(AddPrefixesMigration(vec![
            "settings.kubernetes.nvidia.container-runtime",
            "services.nvidia-container-toolkit",
            "configuration-files.nvidia-container-toolkit",
        ]))
    } else {
        migrate(NoOpMigration)
    }
}

// If `main` itself returned a `Result`, a failure would be printed using the error's Debug
// representation. Snafu gives us nicer Display output, so `main` wraps `run`, prints any error
// with Display on stderr, and exits with status 1.
// https://github.com/shepmaster/snafu/issues/110
fn main() {
    match run() {
        Ok(_) => {}
        Err(err) => {
            eprintln!("{}", err);
            process::exit(1);
        }
    }
}
14 changes: 14 additions & 0 deletions sources/models/shared-defaults/nvidia-k8s-container-toolkit.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[settings.kubernetes.nvidia.container-runtime]
visible-devices-as-volume-mounts = false
visible-devices-envvar-when-unprivileged = true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the default values to prevent unprivileged pods from accessing all the devices:

accept-nvidia-visible-devices-envvar-when-unprivileged = false


[metadata.settings.kubernetes.nvidia.container-runtime]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is missing a migration to remove the affected services on a downgrade.

affected-services = ["nvidia-container-toolkit"]

[services.nvidia-container-toolkit]
configuration-files = ["nvidia-container-toolkit"]
restart-commands = []

[configuration-files.nvidia-container-toolkit]
path = "/etc/nvidia-container-runtime/config.toml"
template-path = "/usr/share/factory/nvidia-container-runtime/nvidia-container-toolkit-config-k8s"
12 changes: 12 additions & 0 deletions sources/models/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ struct KubernetesSettings {
hostname_override: ValidLinuxHostname,
// Generated in `k8s-1.25+` variants only
seccomp_default: bool,
nvidia: K8sNvidiaSettings,
}

// ECS settings.
Expand Down Expand Up @@ -572,3 +573,14 @@ struct Report {
name: String,
description: String,
}

// NVIDIA-specific Kubernetes settings; this is the type of the `nvidia` field of
// `KubernetesSettings`.
#[model]
struct K8sNvidiaSettings {
// Settings for the NVIDIA container runtime (see `K8sContainerRuntimeSettings`).
container_runtime: K8sContainerRuntimeSettings,
}

// NVIDIA container runtime settings for Kubernetes variants. The k8s NVIDIA container toolkit
// config template renders these two values into the `accept-nvidia-visible-devices-*` keys.
#[model]
struct K8sContainerRuntimeSettings {
// Rendered as `accept-nvidia-visible-devices-as-volume-mounts`; the shared defaults set it to
// `false`.
visible_devices_as_volume_mounts: bool,
// Rendered as `accept-nvidia-visible-devices-envvar-when-unprivileged`; the shared defaults set
// it to `true`.
visible_devices_envvar_when_unprivileged: bool,
}