Skip to content

Commit

Permalink
Merge branch 'master' into r2k1/no-ginkgo
Browse files Browse the repository at this point in the history
  • Loading branch information
r2k1 authored Oct 28, 2024
2 parents 9f101fe + 11c0b8b commit 3d2ee13
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/go-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ jobs:
go-version: '1.22'
- run: |
make test
name: Run unit tests for go code in pkg/agent
name: Run unit tests for go code in the repository
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,12 @@ endif
ginkgoBuild: generate
make -C ./test/e2e ginkgo-build

test:
test: test-node-bootstrapper
go test ./...

test-node-bootstrapper:
pushd node-bootstrapper && go test ./... && popd

.PHONY: test-style
test-style: validate-go validate-shell validate-copyright-headers

Expand Down
40 changes: 20 additions & 20 deletions parts/linux/cloud-init/artifacts/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -532,28 +532,28 @@
"multiArchVersionsV2": [
{
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy",
"latestVersion": "v1.27.16",
"previousLatestVersion": "v1.27.15"
"latestVersion": "v1.27.100-akslts",
"previousLatestVersion": "v1.27.16"
},
{
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy",
"latestVersion": "v1.28.14",
"previousLatestVersion": "v1.28.13"
"latestVersion": "v1.28.15",
"previousLatestVersion": "v1.28.14"
},
{
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy",
"latestVersion": "v1.29.9",
"previousLatestVersion": "v1.29.8"
"latestVersion": "v1.29.10",
"previousLatestVersion": "v1.29.9"
},
{
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy",
"latestVersion": "v1.30.5",
"previousLatestVersion": "v1.30.4"
"latestVersion": "v1.30.6",
"previousLatestVersion": "v1.30.5"
},
{
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy",
"latestVersion": "v1.31.1",
"previousLatestVersion": "v1.31.0"
"latestVersion": "v1.31.2",
"previousLatestVersion": "v1.31.1"
}
]
}
Expand Down Expand Up @@ -777,32 +777,32 @@
{
"k8sVersion": "1.27",
"renovateTag": "<DO_NOT_UPDATE>",
"latestVersion": "1.27.16",
"previousLatestVersion": "1.27.15"
"latestVersion": "1.27.100-akslts",
"previousLatestVersion": "1.27.16"
},
{
"k8sVersion": "1.28",
"renovateTag": "<DO_NOT_UPDATE>",
"latestVersion": "1.28.14",
"previousLatestVersion": "1.28.13"
"latestVersion": "1.28.15",
"previousLatestVersion": "1.28.14"
},
{
"k8sVersion": "1.29",
"renovateTag": "<DO_NOT_UPDATE>",
"latestVersion": "1.29.9",
"previousLatestVersion": "1.29.8"
"latestVersion": "1.29.10",
"previousLatestVersion": "1.29.9"
},
{
"k8sVersion": "1.30",
"renovateTag": "<DO_NOT_UPDATE>",
"latestVersion": "1.30.5",
"previousLatestVersion": "1.30.4"
"latestVersion": "1.30.6",
"previousLatestVersion": "1.30.5"
},
{
"k8sVersion": "1.31",
"renovateTag": "<DO_NOT_UPDATE>",
"latestVersion": "1.31.1",
"previousLatestVersion": "1.31.0"
"latestVersion": "1.31.2",
"previousLatestVersion": "1.31.1"
}
],
"downloadURL": "https://acs-mirror.azureedge.net/kubernetes/v${version}/binaries/kubernetes-node-linux-${CPU_ARCH}.tar.gz"
Expand Down
4 changes: 2 additions & 2 deletions pkg/agent/baker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1040,12 +1040,12 @@ func getPortRangeEndValue(portRange string) int {
// NVv3 is untested on AKS, NVv4 is AMD so n/a, and NVv2 no longer seems to exist (?).
func GetGPUDriverVersion(size string) string {
if useGridDrivers(size) {
return datamodel.Nvidia535GridDriverVersion
return datamodel.NvidiaGridDriverVersion
}
if isStandardNCv1(size) {
return datamodel.Nvidia470CudaDriverVersion
}
return datamodel.Nvidia550CudaDriverVersion
return datamodel.NvidiaCudaDriverVersion
}

func isStandardNCv1(size string) bool {
Expand Down
8 changes: 4 additions & 4 deletions pkg/agent/baker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2141,15 +2141,15 @@ var _ = Describe("GetGPUDriverVersion", func() {
Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.Nvidia470CudaDriverVersion))
})
It("should use cuda with nc v3", func() {
Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.Nvidia550CudaDriverVersion))
Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaDriverVersion))
})
It("should use grid with nv v5", func() {
Expect(GetGPUDriverVersion("standard_nv6ads_a10_v5")).To(Equal(datamodel.Nvidia535GridDriverVersion))
Expect(GetGPUDriverVersion("Standard_nv36adms_A10_V5")).To(Equal(datamodel.Nvidia535GridDriverVersion))
Expect(GetGPUDriverVersion("standard_nv6ads_a10_v5")).To(Equal(datamodel.NvidiaGridDriverVersion))
Expect(GetGPUDriverVersion("Standard_nv36adms_A10_V5")).To(Equal(datamodel.NvidiaGridDriverVersion))
})
// NV V1 SKUs were retired in September 2023, leaving this test just for safety
It("should use cuda with nv v1", func() {
Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.Nvidia550CudaDriverVersion))
Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.NvidiaCudaDriverVersion))
})
})

Expand Down
62 changes: 0 additions & 62 deletions pkg/agent/datamodel/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,65 +131,3 @@ const (
EnableIPv6Only = "EnableIPv6Only"
EnableWinDSR = "EnableWinDSR"
)

const (
Nvidia470CudaDriverVersion = "cuda-470.82.01"
Nvidia550CudaDriverVersion = "550.90.12"
Nvidia535GridDriverVersion = "535.161.08"
)

// These SHAs will change once we update aks-gpu images in aks-gpu repository. We do that fairly rarely at this time.
// So for now these will be kept here like this.
const (
AKSGPUCudaVersionSuffix = "20241021235610"
AKSGPUGridVersionSuffix = "20241021235607"
)

/* convergedGPUDriverSizes : these sizes use a "converged" driver to support both cuda/grid workloads.
how do you figure this out? ask HPC or find out by trial and error.
installing vanilla cuda drivers will fail to install with opaque errors.
nvidia-bug-report.sh may be helpful, but usually it tells you the pci card id is incompatible.
That sends me to HPC folks.
see https://github.com/Azure/azhpc-extensions/blob/daaefd78df6f27012caf30f3b54c3bd6dc437652/NvidiaGPU/resources.json
*/
//nolint:gochecknoglobals
var ConvergedGPUDriverSizes = map[string]bool{
"standard_nv6ads_a10_v5": true,
"standard_nv12ads_a10_v5": true,
"standard_nv18ads_a10_v5": true,
"standard_nv36ads_a10_v5": true,
"standard_nv72ads_a10_v5": true,
"standard_nv36adms_a10_v5": true,
"standard_nc8ads_a10_v4": true,
"standard_nc16ads_a10_v4": true,
"standard_nc32ads_a10_v4": true,
}

//nolint:gochecknoglobals
var FabricManagerGPUSizes = map[string]bool{
// A100
"standard_nd96asr_v4": true,
"standard_nd112asr_a100_v4": true,
"standard_nd120asr_a100_v4": true,
"standard_nd96amsr_a100_v4": true,
"standard_nd112amsr_a100_v4": true,
"standard_nd120amsr_a100_v4": true,
// TODO(ace): one of these is probably dupe...
// confirm with HPC/SKU owners.
"standard_nd96ams_a100_v4": true,
"standard_nd96ams_v4": true,
// H100.
"standard_nd46s_h100_v5": true,
"standard_nd48s_h100_v5": true,
"standard_nd50s_h100_v5": true,
"standard_nd92is_h100_v5": true,
"standard_nd96is_h100_v5": true,
"standard_nd100is_h100_v5": true,
"standard_nd92isr_h100_v5": true,
"standard_nd96isr_h100_v5": true,
"standard_nd100isr_h100_v5": true,
// A100 oddballs.
"standard_nc24ads_a100_v4": false, // NCads_v4 will fail to start fabricmanager.
"standard_nc48ads_a100_v4": false,
"standard_nc96ads_a100_v4": false,
}
123 changes: 123 additions & 0 deletions pkg/agent/datamodel/gpu_components.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package datamodel

import (
"encoding/json"
"fmt"
"strings"

"github.com/Azure/agentbaker/parts"
)

// Nvidia470CudaDriverVersion is the fixed CUDA driver version string; unlike the
// variables below it is hard-coded here rather than loaded by LoadConfig.
const Nvidia470CudaDriverVersion = "cuda-470.82.01"

// GPU driver versions and aks-gpu image suffixes. They start out empty and are
// assigned by LoadConfig, which init calls when the package is loaded.
//nolint:gochecknoglobals
var (
// NvidiaCudaDriverVersion is the driver part of the latestVersion of the "aks-gpu-cuda" image entry.
NvidiaCudaDriverVersion string
// NvidiaGridDriverVersion is the driver part of the latestVersion of the "aks-gpu-grid" image entry.
NvidiaGridDriverVersion string
// AKSGPUCudaVersionSuffix is the part after the "-" in the latestVersion of the "aks-gpu-cuda" image entry.
AKSGPUCudaVersionSuffix string
// AKSGPUGridVersionSuffix is the part after the "-" in the latestVersion of the "aks-gpu-grid" image entry.
AKSGPUGridVersionSuffix string
)

// gpuVersion is decoded from the "gpuVersion" object of a GPU container image entry.
type gpuVersion struct {
RenovateTag string `json:"renovateTag"`
// LatestVersion is split on "-" by LoadConfig into a driver version and a suffix.
LatestVersion string `json:"latestVersion"`
}

// gpuContainerImage is one element of the "GPUContainerImages" array.
type gpuContainerImage struct {
// DownloadURL is matched by LoadConfig against "aks-gpu-cuda" / "aks-gpu-grid"
// to decide which set of package variables the entry populates.
DownloadURL string `json:"downloadURL"`
GPUVersion gpuVersion `json:"gpuVersion"`
}

// componentsConfig captures only the part of components.json that LoadConfig reads;
// all other keys in the file are ignored during unmarshalling.
type componentsConfig struct {
GPUContainerImages []gpuContainerImage `json:"GPUContainerImages"`
}

// LoadConfig reads linux/cloud-init/artifacts/components.json from
// parts.Templates and uses its GPUContainerImages entries to set
// NvidiaCudaDriverVersion, AKSGPUCudaVersionSuffix, NvidiaGridDriverVersion and
// AKSGPUGridVersionSuffix.
//
// An entry is used only when its gpuVersion.latestVersion consists of exactly
// two "-"-separated fields ("<driver>-<suffix>"). The entry's downloadURL then
// selects the target: a URL containing "aks-gpu-cuda" sets the CUDA variables,
// one containing "aks-gpu-grid" sets the GRID variables. Every other entry is
// skipped silently, so the variables are left untouched when nothing matches.
//
// It returns an error only if the file cannot be read or is not valid JSON.
func LoadConfig() error {
	// Read the embedded components.json file
	data, err := parts.Templates.ReadFile("linux/cloud-init/artifacts/components.json")
	if err != nil {
		return fmt.Errorf("failed to read components.json: %w", err)
	}

	var config componentsConfig
	if err := json.Unmarshal(data, &config); err != nil {
		return fmt.Errorf("failed to unmarshal components.json: %w", err)
	}

	const driverIndex = 0
	const suffixIndex = 1
	const expectedLength = 2

	for _, image := range config.GPUContainerImages {
		// Named "fields" rather than "parts" so the imported parts package is
		// not shadowed inside the loop.
		fields := strings.Split(image.GPUVersion.LatestVersion, "-")
		if len(fields) != expectedLength {
			continue
		}
		version, suffix := fields[driverIndex], fields[suffixIndex]

		switch {
		case strings.Contains(image.DownloadURL, "aks-gpu-cuda"):
			NvidiaCudaDriverVersion = version
			AKSGPUCudaVersionSuffix = suffix
		case strings.Contains(image.DownloadURL, "aks-gpu-grid"):
			NvidiaGridDriverVersion = version
			AKSGPUGridVersionSuffix = suffix
		}
	}
	return nil
}

// init populates the GPU driver version variables when the package is loaded
// and aborts start-up with a panic if LoadConfig reports an error.
//
//nolint:gochecknoinits
func init() {
	err := LoadConfig()
	if err == nil {
		return
	}
	panic(fmt.Sprintf("Failed to load configuration: %v", err))
}

/* ConvergedGPUDriverSizes : these sizes use a "converged" driver to support both cuda/grid workloads.
How do you figure this out? Ask HPC or find out by trial and error.
Installing vanilla cuda drivers on these sizes will fail with opaque errors.
nvidia-bug-report.sh may be helpful, but usually it only tells you the pci card id is incompatible,
at which point the next step is to ask the HPC folks.
See https://github.com/Azure/azhpc-extensions/blob/daaefd78df6f27012caf30f3b54c3bd6dc437652/NvidiaGPU/resources.json
*/
//nolint:gochecknoglobals
var ConvergedGPUDriverSizes = map[string]bool{
"standard_nv6ads_a10_v5": true,
"standard_nv12ads_a10_v5": true,
"standard_nv18ads_a10_v5": true,
"standard_nv36ads_a10_v5": true,
"standard_nv72ads_a10_v5": true,
"standard_nv36adms_a10_v5": true,
"standard_nc8ads_a10_v4": true,
"standard_nc16ads_a10_v4": true,
"standard_nc32ads_a10_v4": true,
}

// FabricManagerGPUSizes is keyed by lower-case VM size name. The A100 and H100
// ND-series sizes map to true; the NCads A100 v4 sizes are listed explicitly
// with false because fabricmanager fails to start on them (see inline note).
// NOTE(review): presumably callers use the value to decide whether to enable
// fabric manager for a size — confirm against the call sites.
//nolint:gochecknoglobals
var FabricManagerGPUSizes = map[string]bool{
// A100
"standard_nd96asr_v4": true,
"standard_nd112asr_a100_v4": true,
"standard_nd120asr_a100_v4": true,
"standard_nd96amsr_a100_v4": true,
"standard_nd112amsr_a100_v4": true,
"standard_nd120amsr_a100_v4": true,
// TODO(ace): one of these two is probably a duplicate name for the same size;
// confirm with HPC/SKU owners.
"standard_nd96ams_a100_v4": true,
"standard_nd96ams_v4": true,
// H100.
"standard_nd46s_h100_v5": true,
"standard_nd48s_h100_v5": true,
"standard_nd50s_h100_v5": true,
"standard_nd92is_h100_v5": true,
"standard_nd96is_h100_v5": true,
"standard_nd100is_h100_v5": true,
"standard_nd92isr_h100_v5": true,
"standard_nd96isr_h100_v5": true,
"standard_nd100isr_h100_v5": true,
// A100 oddballs.
"standard_nc24ads_a100_v4": false, // NCads_v4 will fail to start fabricmanager.
"standard_nc48ads_a100_v4": false,
"standard_nc96ads_a100_v4": false,
}
50 changes: 50 additions & 0 deletions pkg/agent/datamodel/gpu_components_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// pkg/agent/datamodel/gpu_components_test.go
package datamodel

import (
"regexp"
"testing"
)

// TestLoadConfig verifies that the GPU driver variables were populated and are
// well-formed. It does not call LoadConfig itself: the configuration is loaded
// during package initialization, so the variables are already set here.
func TestLoadConfig(t *testing.T) {
	// Version strings look like "X.Y.Z" where X, Y and Z are numbers, e.g. "550.90.12".
	versionPattern := regexp.MustCompile(`^\d+\.\d+\.\d+$`)
	// Suffixes are exactly 14 digits, which can represent a timestamp, e.g. "20241021235610".
	suffixPattern := regexp.MustCompile(`^\d{14}$`)

	// Each variable is paired with its own name so a failure message can never
	// report the wrong variable.
	checks := []struct {
		name    string
		value   string
		pattern *regexp.Regexp
	}{
		{"NvidiaCudaDriverVersion", NvidiaCudaDriverVersion, versionPattern},
		{"NvidiaGridDriverVersion", NvidiaGridDriverVersion, versionPattern},
		{"AKSGPUCudaVersionSuffix", AKSGPUCudaVersionSuffix, suffixPattern},
		{"AKSGPUGridVersionSuffix", AKSGPUGridVersionSuffix, suffixPattern},
	}

	for _, c := range checks {
		if c.value == "" {
			t.Errorf("%s is empty", c.name)
		}
		if !c.pattern.MatchString(c.value) {
			t.Errorf("%s '%s' does not match expected format", c.name, c.value)
		}
	}
}
Loading

0 comments on commit 3d2ee13

Please sign in to comment.