From e341ae510517646f3aae798b31ae5470c52b325f Mon Sep 17 00:00:00 2001 From: wenhug <50309350+wenhug@users.noreply.github.com> Date: Sat, 26 Oct 2024 11:23:27 -0700 Subject: [PATCH 1/3] chore: add binaries for 1.27.100,1.28.15, 1.29.10, 1.30.6, 1.31.2 (#5144) Co-authored-by: Wen Huang --- .../cloud-init/artifacts/components.json | 40 +++++++++---------- .../generate-windows-vhd-configuration.ps1 | 14 +++---- .../packer/test/linux-vhd-content-test.sh | 3 ++ 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index 89147fd1c5a..4460a39c120 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -532,28 +532,28 @@ "multiArchVersionsV2": [ { "renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy", - "latestVersion": "v1.27.16", - "previousLatestVersion": "v1.27.15" + "latestVersion": "v1.27.100-akslts", + "previousLatestVersion": "v1.27.16" }, { "renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy", - "latestVersion": "v1.28.14", - "previousLatestVersion": "v1.28.13" + "latestVersion": "v1.28.15", + "previousLatestVersion": "v1.28.14" }, { "renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy", - "latestVersion": "v1.29.9", - "previousLatestVersion": "v1.29.8" + "latestVersion": "v1.29.10", + "previousLatestVersion": "v1.29.9" }, { "renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy", - "latestVersion": "v1.30.5", - "previousLatestVersion": "v1.30.4" + "latestVersion": "v1.30.6", + "previousLatestVersion": "v1.30.5" }, { "renovateTag": "registry=https://mcr.microsoft.com, name=oss/kubernetes/kube-proxy", - "latestVersion": "v1.31.1", - "previousLatestVersion": "v1.31.0" + "latestVersion": "v1.31.2", + "previousLatestVersion": "v1.31.1" } ] } @@ -777,32 +777,32 @@ { "k8sVersion": 
"1.27", "renovateTag": "", - "latestVersion": "1.27.16", - "previousLatestVersion": "1.27.15" + "latestVersion": "1.27.100-akslts", + "previousLatestVersion": "1.27.16" }, { "k8sVersion": "1.28", "renovateTag": "", - "latestVersion": "1.28.14", - "previousLatestVersion": "1.28.13" + "latestVersion": "1.28.15", + "previousLatestVersion": "1.28.14" }, { "k8sVersion": "1.29", "renovateTag": "", - "latestVersion": "1.29.9", - "previousLatestVersion": "1.29.8" + "latestVersion": "1.29.10", + "previousLatestVersion": "1.29.9" }, { "k8sVersion": "1.30", "renovateTag": "", - "latestVersion": "1.30.5", - "previousLatestVersion": "1.30.4" + "latestVersion": "1.30.6", + "previousLatestVersion": "1.30.5" }, { "k8sVersion": "1.31", "renovateTag": "", - "latestVersion": "1.31.1", - "previousLatestVersion": "1.31.0" + "latestVersion": "1.31.2", + "previousLatestVersion": "1.31.1" } ], "downloadURL": "https://acs-mirror.azureedge.net/kubernetes/v${version}/binaries/kubernetes-node-linux-${CPU_ARCH}.tar.gz" diff --git a/vhdbuilder/packer/generate-windows-vhd-configuration.ps1 b/vhdbuilder/packer/generate-windows-vhd-configuration.ps1 index 73e5eb09333..64d49f90dd9 100644 --- a/vhdbuilder/packer/generate-windows-vhd-configuration.ps1 +++ b/vhdbuilder/packer/generate-windows-vhd-configuration.ps1 @@ -191,22 +191,20 @@ $global:map = @{ # 2. Keep 1.18.10, 1.18.14, 1.18.17, 1.18.18 # 3. 
Keep v1.18.17-hotfix.20210322 when adding v1.18.17-hotfix.20210505 "c:\akse-cache\win-k8s\" = @( - "https://acs-mirror.azureedge.net/kubernetes/v1.27.14-hotfix.20240712/windowszip/v1.27.14-hotfix.20240712-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.27.15-hotfix.20240712/windowszip/v1.27.15-hotfix.20240712-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.27.16/windowszip/v1.27.16-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.28.11-hotfix.20240712/windowszip/v1.28.11-hotfix.20240712-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.28.12/windowszip/v1.28.12-1int.zip", + "https://acs-mirror.azureedge.net/kubernetes/v1.27.100-akslts/windowszip/v1.27.100-akslts-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.28.13/windowszip/v1.28.13-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.28.14/windowszip/v1.28.14-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.29.6-hotfix.20240712/windowszip/v1.29.6-hotfix.20240712-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.29.7/windowszip/v1.29.7-1int.zip", + "https://acs-mirror.azureedge.net/kubernetes/v1.28.15/windowszip/v1.28.15-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.29.8/windowszip/v1.29.8-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.29.9/windowszip/v1.29.9-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.30.2-hotfix.20240712/windowszip/v1.30.2-hotfix.20240712-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.30.3/windowszip/v1.30.3-1int.zip", + "https://acs-mirror.azureedge.net/kubernetes/v1.29.10/windowszip/v1.29.10-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.30.4/windowszip/v1.30.4-1int.zip", "https://acs-mirror.azureedge.net/kubernetes/v1.30.5/windowszip/v1.30.5-1int.zip", - "https://acs-mirror.azureedge.net/kubernetes/v1.31.1/windowszip/v1.31.1-1int.zip" + "https://acs-mirror.azureedge.net/kubernetes/v1.30.6/windowszip/v1.30.6-1int.zip", + 
"https://acs-mirror.azureedge.net/kubernetes/v1.31.1/windowszip/v1.31.1-1int.zip", + "https://acs-mirror.azureedge.net/kubernetes/v1.31.2/windowszip/v1.31.2-1int.zip" ); "c:\akse-cache\win-vnet-cni\" = @( # Azure CNI v1 (legacy) diff --git a/vhdbuilder/packer/test/linux-vhd-content-test.sh b/vhdbuilder/packer/test/linux-vhd-content-test.sh index b4337c0a94c..c12e03e774d 100644 --- a/vhdbuilder/packer/test/linux-vhd-content-test.sh +++ b/vhdbuilder/packer/test/linux-vhd-content-test.sh @@ -400,6 +400,9 @@ testKubeBinariesPresent() { patchedK8sVersion=$(echo ${patchedK8sVersion} | cut -d"." -f1,2,3) fi k8sVersion=$(echo ${patchedK8sVersion} | cut -d"_" -f1 | cut -d"-" -f1 | cut -d"." -f1,2,3) + if grep -iq akslts <<<${patchedK8sVersion}; then + k8sVersion="$k8sVersion-akslts" + fi kubeletDownloadLocation="$binaryDir/kubelet-$k8sVersion" kubectlDownloadLocation="$binaryDir/kubectl-$k8sVersion" kubeletInstallLocation="/usr/local/bin/kubelet" From 730359ab5ad2e015da4e963fa4de648aa63e860e Mon Sep 17 00:00:00 2001 From: lilypan26 Date: Mon, 28 Oct 2024 09:50:45 -0700 Subject: [PATCH 2/3] chore: add node-bootstrapper to make test target (#5165) Co-authored-by: Lily Pan Co-authored-by: Devinwong --- .github/workflows/go-test.yml | 2 +- Makefile | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/go-test.yml b/.github/workflows/go-test.yml index 7cf0b56c141..01ad22c25c2 100644 --- a/.github/workflows/go-test.yml +++ b/.github/workflows/go-test.yml @@ -11,4 +11,4 @@ jobs: go-version: '1.22' - run: | make test - name: Run unit tests for go code in pkg/agent + name: Run unit tests for go code in the repository diff --git a/Makefile b/Makefile index 0c3368acb08..b01d90a6781 100644 --- a/Makefile +++ b/Makefile @@ -180,9 +180,12 @@ endif ginkgoBuild: generate make -C ./test/e2e ginkgo-build -test: +test: test-node-bootstrapper go test ./... +test-node-bootstrapper: + pushd node-bootstrapper && go test ./... 
&& popd + .PHONY: test-style test-style: validate-go validate-shell validate-copyright-headers From 11c0b8bcf0581527706827eb3ea62a57da1403d6 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan <35557827+ganeshkumarashok@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:37:22 -0700 Subject: [PATCH 3/3] Use GPU versions in components.json for CSE (#5162) --- pkg/agent/baker.go | 4 +- pkg/agent/baker_test.go | 8 +- pkg/agent/datamodel/const.go | 62 ----------- pkg/agent/datamodel/gpu_components.go | 123 +++++++++++++++++++++ pkg/agent/datamodel/gpu_components_test.go | 50 +++++++++ 5 files changed, 179 insertions(+), 68 deletions(-) create mode 100644 pkg/agent/datamodel/gpu_components.go create mode 100644 pkg/agent/datamodel/gpu_components_test.go diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 043e1b82fa0..7c774c3808d 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1040,12 +1040,12 @@ func getPortRangeEndValue(portRange string) int { // NVv3 is untested on AKS, NVv4 is AMD so n/a, and NVv2 no longer seems to exist (?). 
func GetGPUDriverVersion(size string) string { if useGridDrivers(size) { - return datamodel.Nvidia535GridDriverVersion + return datamodel.NvidiaGridDriverVersion } if isStandardNCv1(size) { return datamodel.Nvidia470CudaDriverVersion } - return datamodel.Nvidia550CudaDriverVersion + return datamodel.NvidiaCudaDriverVersion } func isStandardNCv1(size string) bool { diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index 459c6bb4822..0f36b1b7273 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -2141,15 +2141,15 @@ var _ = Describe("GetGPUDriverVersion", func() { Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.Nvidia470CudaDriverVersion)) }) It("should use cuda with nc v3", func() { - Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.Nvidia550CudaDriverVersion)) + Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaDriverVersion)) }) It("should use grid with nv v5", func() { - Expect(GetGPUDriverVersion("standard_nv6ads_a10_v5")).To(Equal(datamodel.Nvidia535GridDriverVersion)) - Expect(GetGPUDriverVersion("Standard_nv36adms_A10_V5")).To(Equal(datamodel.Nvidia535GridDriverVersion)) + Expect(GetGPUDriverVersion("standard_nv6ads_a10_v5")).To(Equal(datamodel.NvidiaGridDriverVersion)) + Expect(GetGPUDriverVersion("Standard_nv36adms_A10_V5")).To(Equal(datamodel.NvidiaGridDriverVersion)) }) // NV V1 SKUs were retired in September 2023, leaving this test just for safety It("should use cuda with nv v1", func() { - Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.Nvidia550CudaDriverVersion)) + Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.NvidiaCudaDriverVersion)) }) }) diff --git a/pkg/agent/datamodel/const.go b/pkg/agent/datamodel/const.go index 4fc4fb39808..5ef482c36a0 100644 --- a/pkg/agent/datamodel/const.go +++ b/pkg/agent/datamodel/const.go @@ -131,65 +131,3 @@ const ( EnableIPv6Only = "EnableIPv6Only" EnableWinDSR = "EnableWinDSR" ) - -const ( - 
Nvidia470CudaDriverVersion = "cuda-470.82.01" - Nvidia550CudaDriverVersion = "550.90.12" - Nvidia535GridDriverVersion = "535.161.08" -) - -// These SHAs will change once we update aks-gpu images in aks-gpu repository. We do that fairly rarely at this time. -// So for now these will be kept here like this. -const ( - AKSGPUCudaVersionSuffix = "20241021235610" - AKSGPUGridVersionSuffix = "20241021235607" -) - -/* convergedGPUDriverSizes : these sizes use a "converged" driver to support both cuda/grid workloads. -how do you figure this out? ask HPC or find out by trial and error. -installing vanilla cuda drivers will fail to install with opaque errors. -nvidia-bug-report.sh may be helpful, but usually it tells you the pci card id is incompatible. -That sends me to HPC folks. -see https://github.com/Azure/azhpc-extensions/blob/daaefd78df6f27012caf30f3b54c3bd6dc437652/NvidiaGPU/resources.json -*/ -//nolint:gochecknoglobals -var ConvergedGPUDriverSizes = map[string]bool{ - "standard_nv6ads_a10_v5": true, - "standard_nv12ads_a10_v5": true, - "standard_nv18ads_a10_v5": true, - "standard_nv36ads_a10_v5": true, - "standard_nv72ads_a10_v5": true, - "standard_nv36adms_a10_v5": true, - "standard_nc8ads_a10_v4": true, - "standard_nc16ads_a10_v4": true, - "standard_nc32ads_a10_v4": true, -} - -//nolint:gochecknoglobals -var FabricManagerGPUSizes = map[string]bool{ - // A100 - "standard_nd96asr_v4": true, - "standard_nd112asr_a100_v4": true, - "standard_nd120asr_a100_v4": true, - "standard_nd96amsr_a100_v4": true, - "standard_nd112amsr_a100_v4": true, - "standard_nd120amsr_a100_v4": true, - // TODO(ace): one of these is probably dupe... - // confirm with HPC/SKU owners. - "standard_nd96ams_a100_v4": true, - "standard_nd96ams_v4": true, - // H100. 
- "standard_nd46s_h100_v5": true, - "standard_nd48s_h100_v5": true, - "standard_nd50s_h100_v5": true, - "standard_nd92is_h100_v5": true, - "standard_nd96is_h100_v5": true, - "standard_nd100is_h100_v5": true, - "standard_nd92isr_h100_v5": true, - "standard_nd96isr_h100_v5": true, - "standard_nd100isr_h100_v5": true, - // A100 oddballs. - "standard_nc24ads_a100_v4": false, // NCads_v4 will fail to start fabricmanager. - "standard_nc48ads_a100_v4": false, - "standard_nc96ads_a100_v4": false, -} diff --git a/pkg/agent/datamodel/gpu_components.go b/pkg/agent/datamodel/gpu_components.go new file mode 100644 index 00000000000..a79a7e7b8c3 --- /dev/null +++ b/pkg/agent/datamodel/gpu_components.go @@ -0,0 +1,123 @@ +package datamodel + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/Azure/agentbaker/parts" +) + +const Nvidia470CudaDriverVersion = "cuda-470.82.01" + +//nolint:gochecknoglobals +var ( + NvidiaCudaDriverVersion string + NvidiaGridDriverVersion string + AKSGPUCudaVersionSuffix string + AKSGPUGridVersionSuffix string +) + +type gpuVersion struct { + RenovateTag string `json:"renovateTag"` + LatestVersion string `json:"latestVersion"` +} + +type gpuContainerImage struct { + DownloadURL string `json:"downloadURL"` + GPUVersion gpuVersion `json:"gpuVersion"` +} + +type componentsConfig struct { + GPUContainerImages []gpuContainerImage `json:"GPUContainerImages"` +} + +// LoadConfig reads the embedded components.json and sets the NVIDIA cuda/grid driver +// versions and aks-gpu image suffixes from its GPUContainerImages entries. It returns +// an error if the file cannot be read or parsed, or if any of those values is missing. +func LoadConfig() error { + data, err := parts.Templates.ReadFile("linux/cloud-init/artifacts/components.json") + if err != nil { + return fmt.Errorf("failed to read components.json: %w", err) + } + + var config componentsConfig + if err := json.Unmarshal(data, &config); err != nil { + return fmt.Errorf("failed to unmarshal components.json: %w", err) + } + + for _, image := range config.GPUContainerImages { + // LatestVersion must look like "<driver version>-<suffix>" (exactly one dash); skip anything else. + version, suffix, ok := strings.Cut(image.GPUVersion.LatestVersion, "-")
+ if !ok || strings.Contains(suffix, "-") { + continue + } + switch { + case strings.Contains(image.DownloadURL, "aks-gpu-cuda"): + NvidiaCudaDriverVersion, AKSGPUCudaVersionSuffix = version, suffix + case strings.Contains(image.DownloadURL, "aks-gpu-grid"): + NvidiaGridDriverVersion, AKSGPUGridVersionSuffix = version, suffix + } + } + if NvidiaCudaDriverVersion == "" || AKSGPUCudaVersionSuffix == "" || + NvidiaGridDriverVersion == "" || AKSGPUGridVersionSuffix == "" { + return fmt.Errorf("failed to find aks-gpu-cuda and aks-gpu-grid versions in components.json") + } + return nil +} + +//nolint:gochecknoinits +func init() { + if err := LoadConfig(); err != nil { + panic(fmt.Sprintf("Failed to load configuration: %v", err)) + } +} + +/* convergedGPUDriverSizes : these sizes use a "converged" driver to support both cuda/grid workloads. +how do you figure this out? ask HPC or find out by trial and error. +installing vanilla cuda drivers will fail to install with opaque errors. +nvidia-bug-report.sh may be helpful, but usually it tells you the pci card id is incompatible. +That sends me to HPC folks. +see https://github.com/Azure/azhpc-extensions/blob/daaefd78df6f27012caf30f3b54c3bd6dc437652/NvidiaGPU/resources.json +*/ +//nolint:gochecknoglobals +var ConvergedGPUDriverSizes = map[string]bool{ + "standard_nv6ads_a10_v5": true, + "standard_nv12ads_a10_v5": true, + "standard_nv18ads_a10_v5": true, + "standard_nv36ads_a10_v5": true, + "standard_nv72ads_a10_v5": true, + "standard_nv36adms_a10_v5": true, + "standard_nc8ads_a10_v4": true, + "standard_nc16ads_a10_v4": true, + "standard_nc32ads_a10_v4": true, +} + +//nolint:gochecknoglobals +var FabricManagerGPUSizes = map[string]bool{ + // A100 + "standard_nd96asr_v4": true, + "standard_nd112asr_a100_v4": true, + "standard_nd120asr_a100_v4": true, + "standard_nd96amsr_a100_v4": true, + "standard_nd112amsr_a100_v4": true, + "standard_nd120amsr_a100_v4": true, + // TODO(ace): one of these is probably dupe... + // confirm with HPC/SKU owners. + "standard_nd96ams_a100_v4": true, + "standard_nd96ams_v4": true, + // H100.
+ "standard_nd46s_h100_v5": true, + "standard_nd48s_h100_v5": true, + "standard_nd50s_h100_v5": true, + "standard_nd92is_h100_v5": true, + "standard_nd96is_h100_v5": true, + "standard_nd100is_h100_v5": true, + "standard_nd92isr_h100_v5": true, + "standard_nd96isr_h100_v5": true, + "standard_nd100isr_h100_v5": true, + // A100 oddballs. + "standard_nc24ads_a100_v4": false, // NCads_v4 will fail to start fabricmanager. + "standard_nc48ads_a100_v4": false, + "standard_nc96ads_a100_v4": false, +} diff --git a/pkg/agent/datamodel/gpu_components_test.go b/pkg/agent/datamodel/gpu_components_test.go new file mode 100644 index 00000000000..ca0b06cb357 --- /dev/null +++ b/pkg/agent/datamodel/gpu_components_test.go @@ -0,0 +1,50 @@ +// pkg/agent/datamodel/gpu_components_test.go +package datamodel + +import ( + "regexp" + "testing" +) + +func TestLoadConfig(t *testing.T) { + // The configuration is loaded during package initialization + if NvidiaCudaDriverVersion == "" { + t.Error("NvidiaCudaDriverVersion is empty") + } + if NvidiaGridDriverVersion == "" { + t.Error("NvidiaGridDriverVersion is empty") + } + + if AKSGPUCudaVersionSuffix == "" { + t.Error("AKSGPUCudaVersionSuffix is empty") + } + + if AKSGPUGridVersionSuffix == "" { + t.Error("AKSGPUGridVersionSuffix is empty") + } + + // Define regular expressions for expected formats + versionRegex := `^\d+\.\d+\.\d+$` // match version strings in a format like "X.Y.Z", where each of X, Y, and Z are numbers.
e.g., "550.90.12" + suffixRegex := `^\d{14}$` // match a string of exactly 14 digits, which can represent a timestamp e.g., "20241021235610" + + // Compile the regular expressions + versionPattern := regexp.MustCompile(versionRegex) + suffixPattern := regexp.MustCompile(suffixRegex) + + // Test NvidiaCudaDriverVersion and other variables' format + if !versionPattern.MatchString(NvidiaCudaDriverVersion) { + t.Errorf("NvidiaCudaDriverVersion '%s' does not match expected format", NvidiaCudaDriverVersion) + } + + if !versionPattern.MatchString(NvidiaGridDriverVersion) { + t.Errorf("NvidiaGridDriverVersion '%s' does not match expected format", NvidiaGridDriverVersion) + } + + if !suffixPattern.MatchString(AKSGPUCudaVersionSuffix) { + t.Errorf("AKSGPUCudaVersionSuffix '%s' does not match expected format", AKSGPUCudaVersionSuffix) + } + + if !suffixPattern.MatchString(AKSGPUGridVersionSuffix) { + t.Errorf("AKSGPUGridVersionSuffix '%s' does not match expected format", AKSGPUGridVersionSuffix) + } +}