Skip to content

Commit

Permalink
Revert "Fix: Upgrade to Cuda 12.4, use defaults for missing env vars, include more libs (#898)"
Browse files Browse the repository at this point in the history

This reverts commit 5a5a0cd.
  • Loading branch information
luke-lombardi committed Feb 3, 2025
1 parent abadd52 commit 0e17877
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 37 deletions.
4 changes: 1 addition & 3 deletions docker/Dockerfile.worker
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ RUN go build -o /usr/local/bin/worker ./cmd/worker/main.go

# final image
# ========================
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS release
FROM nvidia/cuda:12.3.1-base-ubuntu22.04 AS release
FROM release AS dev

FROM ${BASE_STAGE} AS final
Expand All @@ -109,8 +109,6 @@ RUN apt-get update && \
apt-get update && \
apt-get install psmisc

RUN apt-get install -y cuda-nvcc-12-4 libnvidia-compute-550 libnvidia-encode-550 libnvidia-decode-550 libnvidia-extra-550 nvidia-utils-550

RUN curl -L https://beam-runner-python-deps.s3.amazonaws.com/juicefs -o /usr/local/bin/juicefs && chmod +x /usr/local/bin/juicefs
RUN curl -fsSL https://tailscale.com/install.sh | sh
RUN apt-get install -y --no-install-recommends nvidia-container-toolkit-base nvidia-container-toolkit
Expand Down
26 changes: 11 additions & 15 deletions pkg/worker/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
)

var (
defaultContainerCudaVersion string = "12.4"
defaultContainerCudaVersion string = "12.3"
defaultContainerPath []string = []string{"/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"}
defaultContainerLibrary []string = []string{"/usr/lib/x86_64-linux-gnu", "/usr/lib/worker/x86_64-linux-gnu", "/usr/local/nvidia/lib64"}
)
Expand Down Expand Up @@ -205,15 +205,15 @@ func minor(dev uint64) uint64 {

func (c *ContainerNvidiaManager) InjectEnvVars(env []string, options *ContainerOptions) ([]string, bool) {
existingCudaFound := false
cudaEnvVarDefaults := map[string]string{
"NVIDIA_DRIVER_CAPABILITIES": "all",
"NVIDIA_REQUIRE_CUDA": "",
"NVARCH": "",
"NV_CUDA_COMPAT_PACKAGE": "",
"NV_CUDA_CUDART_VERSION": "",
"CUDA_VERSION": "",
"GPU_TYPE": "",
"CUDA_HOME": fmt.Sprintf("/usr/local/cuda-%s", defaultContainerCudaVersion),
cudaEnvVarNames := []string{
"NVIDIA_DRIVER_CAPABILITIES",
"NVIDIA_REQUIRE_CUDA",
"NVARCH",
"NV_CUDA_COMPAT_PACKAGE",
"NV_CUDA_CUDART_VERSION",
"CUDA_VERSION",
"GPU_TYPE",
"CUDA_HOME",
}

initialEnvVars := make(map[string]string)
Expand Down Expand Up @@ -251,7 +251,7 @@ func (c *ContainerNvidiaManager) InjectEnvVars(env []string, options *ContainerO
}

var cudaEnvVars []string
for key, defaultValue := range cudaEnvVarDefaults {
for _, key := range cudaEnvVarNames {
cudaEnvVarValue := os.Getenv(key)

if existingCudaFound {
Expand All @@ -262,10 +262,6 @@ func (c *ContainerNvidiaManager) InjectEnvVars(env []string, options *ContainerO
}
}

if cudaEnvVarValue == "" {
cudaEnvVarValue = defaultValue
}

cudaEnvVars = append(cudaEnvVars, fmt.Sprintf("%s=%s", key, cudaEnvVarValue))
}

Expand Down
37 changes: 18 additions & 19 deletions pkg/worker/nvidia_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@ import (
"fmt"
"os"
"reflect"
"sort"
"strings"
"syscall"
"testing"

"github.com/opencontainers/runtime-spec/specs-go"
"github.com/tj/assert"
)

type GPUInfoClientForTest struct {
Expand Down Expand Up @@ -45,7 +44,7 @@ func TestInjectNvidiaEnvVarsNoCudaInImage(t *testing.T) {
// Set some environment variables to simulate NVIDIA settings
os.Setenv("NVIDIA_DRIVER_CAPABILITIES", "all")
os.Setenv("NVIDIA_REQUIRE_CUDA", "cuda>=9.0")
os.Setenv("CUDA_HOME", "/usr/local/cuda-12.4")
os.Setenv("CUDA_HOME", "/usr/local/cuda-12.3")

expectedEnv := []string{
"INITIAL=1",
Expand All @@ -56,21 +55,16 @@ func TestInjectNvidiaEnvVarsNoCudaInImage(t *testing.T) {
"NV_CUDA_CUDART_VERSION=",
"CUDA_VERSION=",
"GPU_TYPE=",
"CUDA_HOME=/usr/local/cuda-12.4",
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/cuda-12.4/bin:$PATH",
"LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/worker/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/cuda-12.4/targets/x86_64-linux/lib:$LD_LIBRARY_PATH",
"CUDA_HOME=/usr/local/cuda-12.3",
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/cuda-12.3/bin:$PATH",
"LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/worker/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/cuda-12.3/targets/x86_64-linux/lib:$LD_LIBRARY_PATH",
}

resultEnv, _ := manager.InjectEnvVars(initialEnv, &ContainerOptions{
InitialSpec: &specs.Spec{
Process: &specs.Process{},
},
})

// Sort both slices before comparison
sort.Strings(expectedEnv)
sort.Strings(resultEnv)

if !reflect.DeepEqual(expectedEnv, resultEnv) {
t.Errorf("Expected %v, got %v", expectedEnv, resultEnv)
}
Expand All @@ -83,26 +77,31 @@ func TestInjectNvidiaEnvVarsExistingCudaInImage(t *testing.T) {
// Set some environment variables to simulate NVIDIA settings
os.Setenv("NVIDIA_DRIVER_CAPABILITIES", "all")
os.Setenv("NVIDIA_REQUIRE_CUDA", "cuda>=9.0")
os.Setenv("CUDA_VERSION", "12.4")
os.Setenv("CUDA_VERSION", "12.3")

expectedEnv := []string{
"INITIAL=1",
"NVIDIA_REQUIRE_CUDA=",
"CUDA_VERSION=12.4.1",
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/cuda-12.4/bin:$PATH",
"LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/worker/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/cuda-12.4/targets/x86_64-linux/lib:$LD_LIBRARY_PATH",
"CUDA_VERSION=11.8.2",
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/cuda-11.8/bin:$PATH",
"LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/worker/x86_64-linux-gnu:/usr/local/nvidia/lib64:/usr/local/cuda-11.8/targets/x86_64-linux/lib:$LD_LIBRARY_PATH",
}

resultEnv, _ := manager.InjectEnvVars(initialEnv, &ContainerOptions{
InitialSpec: &specs.Spec{
Process: &specs.Process{Env: []string{"NVIDIA_REQUIRE_CUDA=", "CUDA_VERSION=12.4.1"}},
Process: &specs.Process{Env: []string{"NVIDIA_REQUIRE_CUDA=", "CUDA_VERSION=11.8.2"}},
},
})

sort.Strings(expectedEnv)
sort.Strings(resultEnv)
expectedEnvStr := strings.Join(expectedEnv, "")
resultEnvStr := strings.Join(resultEnv, "")

expectedEnvStr = strings.ReplaceAll(expectedEnvStr, " ", "")
resultEnvStr = strings.ReplaceAll(resultEnvStr, " ", "")

assert.Equal(t, expectedEnv, resultEnv)
if expectedEnvStr != resultEnvStr {
t.Errorf("Expected %v, got %v", expectedEnv, resultEnv)
}
}

func TestInjectNvidiaMounts(t *testing.T) {
Expand Down

0 comments on commit 0e17877

Please sign in to comment.