Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nvidia container-runtime API for GPU allocation #4052

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -319,4 +319,6 @@ version = "1.21.0"
"migrate_v1.21.0_pod-infra-container-image-remove-settings-generator.lz4",
"migrate_v1.21.0_pod-infra-container-image-affected-services.lz4",
"migrate_v1.21.0_pod-infra-container-image-services.lz4",
"migrate_v1.21.0_container-runtime-nvidia-k8s.lz4",
"migrate_v1.21.0_container-runtime-nvidia-k8s-metadata.lz4",
]
1 change: 1 addition & 0 deletions packages/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.patch.bz2
*.src.rpm
*.zip
*.rpm
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Template for the NVIDIA container runtime configuration on Kubernetes variants.
# The section above the `+++` separator lists the settings extensions and helpers
# the template body needs; the section below it is the rendered file content.
# The two `accept-nvidia-visible-devices-*` keys are filled from
# `settings.kubernetes.nvidia.container-runtime`, falling back to `true` / `false`
# through the `default` helper when the setting is absent.
[required-extensions]
kubernetes = "v1"
std = { version = "v1", helpers = ["default"] }

+++
accept-nvidia-visible-devices-as-volume-mounts = {{default true settings.kubernetes.nvidia.container-runtime.visible-devices-as-volume-mounts}}
accept-nvidia-visible-devices-envvar-when-unprivileged = {{default false settings.kubernetes.nvidia.container-runtime.visible-devices-envvar-when-unprivileged}}

[nvidia-container-cli]
root = "/"
path = "/usr/bin/nvidia-container-cli"
environment = []
ldconfig = "@/sbin/ldconfig"

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1 +1 @@
C /etc/nvidia-container-runtime/config.toml - - - - /usr/share/factory/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
d /etc/nvidia-container-runtime - - - - -
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ License: Apache-2.0
URL: https://%{goimport}

Source0: https://%{goimport}/archive/v%{gover}/nvidia-container-toolkit-%{gover}.tar.gz
Source1: nvidia-container-toolkit-config-k8s.toml
Source1: nvidia-container-toolkit-config-k8s
Source2: nvidia-container-toolkit-config-ecs.toml
Source3: nvidia-oci-hooks-json
Source4: nvidia-gpu-devices.rules
Expand Down Expand Up @@ -82,5 +82,5 @@ ln -s shimpei %{buildroot}%{_cross_bindir}/nvidia-oci
%{_cross_tmpfilesdir}/nvidia-container-toolkit-ecs.conf

%files k8s
%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s
%{_cross_tmpfilesdir}/nvidia-container-toolkit-k8s.conf
16 changes: 16 additions & 0 deletions sources/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions sources/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ members = [
"api/migration/migrations/v1.21.0/pod-infra-container-image-affected-services",
"api/migration/migrations/v1.21.0/pod-infra-container-image-remove-settings-generator",
"api/migration/migrations/v1.21.0/pod-infra-container-image-services",
"api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s",
"api/migration/migrations/v1.21.0/container-runtime-nvidia-k8s-metadata",

"bloodhound",

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "container-runtime-nvidia-k8s-metadata"
version = "0.1.0"
edition = "2021"
authors = ["Yutong Sun <[email protected]>"]
license = "Apache-2.0 OR MIT"
publish = false
# Don't rebuild crate just because of changes to README.
exclude = ["README.md"]

[dependencies]
migration-helpers = { path = "../../../migration-helpers", version = "0.1.0"}

[build-dependencies]
bottlerocket-variant = { version = "0.1", path = "../../../../../bottlerocket-variant" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
use bottlerocket_variant::Variant;

// Build script entry point. Loads the variant description from the build environment and emits
// its cfg values; the migration binary branches on `variant_family` / `variant_flavor`.
// Panics (failing the build) if the variant cannot be read.
fn main() {
    Variant::from_env().unwrap().emit_cfgs();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
use migration_helpers::common_migrations::{AddMetadataMigration, NoOpMigration, SettingMetadata};
use migration_helpers::migrate;
use migration_helpers::Result;
use std::process;

/// Registers the `affected-services` metadata for the new
/// `settings.kubernetes.nvidia.container-runtime` setting.
///
/// The setting only exists on NVIDIA flavors of the `aws-k8s` variant family, so every other
/// variant runs a no-op migration instead.
fn run() -> Result<()> {
    // Both cfg values are emitted by this crate's build script.
    let is_nvidia_k8s = cfg!(all(variant_family = "aws-k8s", variant_flavor = "nvidia"));
    if !is_nvidia_k8s {
        return migrate(NoOpMigration);
    }

    migrate(AddMetadataMigration(&[SettingMetadata {
        metadata: &["affected-services"],
        setting: "settings.kubernetes.nvidia.container-runtime",
    }]))
}

// `run` holds the actual migration logic; `main` only reports failure. If `main` returned the
// Result itself, the error would be printed with its Debug representation. Snafu gives us a
// nicer Display representation, so we print that ourselves and exit non-zero.
// https://github.com/shepmaster/snafu/issues/110
fn main() {
    match run() {
        Ok(()) => {}
        Err(err) => {
            eprintln!("{}", err);
            process::exit(1);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "container-runtime-nvidia-k8s"
version = "0.1.0"
edition = "2021"
authors = ["Monirul Islam <[email protected]>"]
license = "Apache-2.0 OR MIT"
publish = false
# Don't rebuild crate just because of changes to README.
exclude = ["README.md"]

[dependencies]
migration-helpers = { path = "../../../migration-helpers", version = "0.1.0"}

[build-dependencies]
bottlerocket-variant = { version = "0.1", path = "../../../../../bottlerocket-variant" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
use bottlerocket_variant::Variant;

// Build script entry point. Loads the variant description from the build environment and emits
// its cfg values; the migration binary branches on `variant_family` / `variant_flavor`.
// Panics (failing the build) if the variant cannot be read.
fn main() {
    Variant::from_env().unwrap().emit_cfgs();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
use migration_helpers::common_migrations::{AddPrefixesMigration, NoOpMigration};
use migration_helpers::{migrate, Result};
use std::process;

/// We added a new setting for configuring the NVIDIA container runtime (the file rendered to
/// `/etc/nvidia-container-runtime/config.toml`), only for NVIDIA k8s variants.
///
/// The same change also introduces a `nvidia-container-toolkit` service and configuration file
/// in the shared defaults, so their prefixes are listed alongside the setting. Otherwise a
/// downgrade would leave behind a configuration file entry whose template does not exist in the
/// older image.
///
/// NOTE(review): this relies on `AddPrefixesMigration` removing every key under the listed
/// prefixes on downgrade and doing nothing on upgrade -- confirm against `migration-helpers`.
///
/// All other variants run a no-op migration.
fn run() -> Result<()> {
    if cfg!(variant_family = "aws-k8s") && cfg!(variant_flavor = "nvidia") {
        migrate(AddPrefixesMigration(vec![
            "settings.kubernetes.nvidia.container-runtime",
            "services.nvidia-container-toolkit",
            "configuration-files.nvidia-container-toolkit",
        ]))
    } else {
        migrate(NoOpMigration)
    }
}

// `run` holds the actual migration logic; `main` only reports failure. If `main` returned the
// Result itself, the error would be printed with its Debug representation. Snafu gives us a
// nicer Display representation, so we print that ourselves and exit non-zero.
// https://github.com/shepmaster/snafu/issues/110
fn main() {
    match run() {
        Ok(()) => {}
        Err(err) => {
            eprintln!("{}", err);
            process::exit(1);
        }
    }
}
14 changes: 14 additions & 0 deletions sources/models/shared-defaults/nvidia-k8s-container-toolkit.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Shared defaults for the NVIDIA container runtime settings on Kubernetes variants.

# Default values for the two user-facing settings.
[settings.kubernetes.nvidia.container-runtime]
visible-devices-as-volume-mounts = true
visible-devices-envvar-when-unprivileged = false

# Ties the settings to the service below.
[metadata.settings.kubernetes.nvidia.container-runtime]
affected-services = ["nvidia-container-toolkit"]

# The service owns a single configuration file and lists no restart commands.
[services.nvidia-container-toolkit]
configuration-files = ["nvidia-container-toolkit"]
restart-commands = []

# Output path of the configuration file and the template it is produced from.
[configuration-files.nvidia-container-toolkit]
path = "/etc/nvidia-container-runtime/config.toml"
template-path = "/usr/share/factory/nvidia-container-runtime/nvidia-container-toolkit-config-k8s"
12 changes: 12 additions & 0 deletions sources/models/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ struct KubernetesSettings {
hostname_override: ValidLinuxHostname,
// Generated in `k8s-1.25+` variants only
seccomp_default: bool,
nvidia: K8sNvidiaSettings,
}

// ECS settings.
Expand Down Expand Up @@ -562,3 +563,14 @@ struct Report {
name: String,
description: String,
}

// NVIDIA-specific Kubernetes settings, held in the `nvidia` field of `KubernetesSettings`.
// Currently only groups the NVIDIA container runtime settings.
#[model]
struct K8sNvidiaSettings {
container_runtime: K8sContainerRuntimeSettings,
}

// NVIDIA container runtime settings for Kubernetes variants.
// The NVIDIA container runtime config template reads these under
// `settings.kubernetes.nvidia.container-runtime` to fill the
// `accept-nvidia-visible-devices-as-volume-mounts` and
// `accept-nvidia-visible-devices-envvar-when-unprivileged` keys.
#[model]
struct K8sContainerRuntimeSettings {
visible_devices_as_volume_mounts: bool,
visible_devices_envvar_when_unprivileged: bool,
}