From fa5b416264c8ea7cc389603833c9f131a2331f97 Mon Sep 17 00:00:00 2001 From: Ilya Alekseyev Date: Tue, 13 Aug 2024 14:43:57 +0000 Subject: [PATCH] Initial GPU support implementation --- pkg/api/v1alpha1/nutanixmachineconfig.go | 28 + .../v1alpha1/nutanixmachineconfig_types.go | 4 + pkg/constants/constants.go | 2 + pkg/providers/nutanix/client.go | 1 + pkg/providers/nutanix/config/md-template.yaml | 12 + pkg/providers/nutanix/mocks/client.go | 15 + pkg/providers/nutanix/provider_test.go | 1 + pkg/providers/nutanix/template.go | 4 + pkg/providers/nutanix/template_test.go | 46 ++ .../nutanix/testdata/eksa-cluster-gpus.yaml | 75 ++ .../testdata/expected_results_gpus.yaml | 611 +++++++++++++++++ .../testdata/expected_results_gpus_md.yaml | 86 +++ pkg/providers/nutanix/validator.go | 278 +++++++- pkg/providers/nutanix/validator_test.go | 644 ++++++++++++++++++ 14 files changed, 1800 insertions(+), 7 deletions(-) create mode 100644 pkg/providers/nutanix/testdata/eksa-cluster-gpus.yaml create mode 100644 pkg/providers/nutanix/testdata/expected_results_gpus.yaml create mode 100644 pkg/providers/nutanix/testdata/expected_results_gpus_md.yaml diff --git a/pkg/api/v1alpha1/nutanixmachineconfig.go b/pkg/api/v1alpha1/nutanixmachineconfig.go index 66b59aaca465..00e09101a05f 100644 --- a/pkg/api/v1alpha1/nutanixmachineconfig.go +++ b/pkg/api/v1alpha1/nutanixmachineconfig.go @@ -10,10 +10,17 @@ import ( // NutanixIdentifierType is an enumeration of different resource identifier types. type NutanixIdentifierType string +// NutanixGPUIdentifierType is an enumeration of different GPU identifier types. +type NutanixGPUIdentifierType string + func (c NutanixIdentifierType) String() string { return string(c) } +func (c NutanixGPUIdentifierType) String() string { + return string(c) +} + const ( // NutanixMachineConfigKind is the kind for a NutanixMachineConfig. 
NutanixMachineConfigKind = "NutanixMachineConfig" @@ -23,6 +30,11 @@ const ( // NutanixIdentifierName is a resource identifier identifying the object by Name. NutanixIdentifierName NutanixIdentifierType = "name" + // NutanixGPUIdentifierDeviceID is a GPU identifier identifying the object by DeviceID. + NutanixGPUIdentifierDeviceID NutanixGPUIdentifierType = "deviceID" + // NutanixGPUIdentifierName is a GPU identifier identifying the object by Name. + NutanixGPUIdentifierName NutanixGPUIdentifierType = "name" + defaultNutanixOSFamily = Ubuntu defaultNutanixSystemDiskSizeGi = "40Gi" defaultNutanixMemorySizeGi = "4Gi" @@ -62,6 +74,22 @@ type NutanixCategoryIdentifier struct { Value string `json:"value,omitempty"` } +// NutanixGPUIdentifier holds VM GPU device configuration. +type NutanixGPUIdentifier struct { + // deviceID is the device ID of the GPU device. + // +optional + DeviceID *int64 `json:"deviceID,omitempty"` + + // name is the name of the GPU device. + // +optional + Name string `json:"name,omitempty"` + + // type is the identifier type of the GPU device: deviceID or name. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum:=deviceID;name + Type NutanixGPUIdentifierType `json:"type"` +} + // NutanixMachineConfigGenerateOpt is a functional option that can be passed to NewNutanixMachineConfigGenerate to // customize the generated machine config // diff --git a/pkg/api/v1alpha1/nutanixmachineconfig_types.go b/pkg/api/v1alpha1/nutanixmachineconfig_types.go index 3a696f3b3fc2..d63dcb8939df 100644 --- a/pkg/api/v1alpha1/nutanixmachineconfig_types.go +++ b/pkg/api/v1alpha1/nutanixmachineconfig_types.go @@ -59,6 +59,10 @@ type NutanixMachineConfigSpec struct { // Categories must be created in Prism Central before they can be used. // +kubebuilder:validation:Optional AdditionalCategories []NutanixCategoryIdentifier `json:"additionalCategories,omitempty"` + + // List of GPU devices that should be added to the VMs. 
+ // +kubebuilder:validation:Optional + GPUs []NutanixGPUIdentifier `json:"gpus,omitempty"` } // SetDefaults sets defaults to NutanixMachineConfig if user has not provided. diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 3eb41b91e27f..d71726d4ea7e 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -90,6 +90,8 @@ const ( ConfigMapKind = "ConfigMap" ClusterResourceSetKind = "ClusterResourceSet" + NutanixMachineConfigKind = "NutanixMachineConfig" + BottlerocketDefaultUser = "ec2-user" UbuntuDefaultUser = "capv" diff --git a/pkg/providers/nutanix/client.go b/pkg/providers/nutanix/client.go index 4de7b5e52ffb..286fd66b7cc8 100644 --- a/pkg/providers/nutanix/client.go +++ b/pkg/providers/nutanix/client.go @@ -8,6 +8,7 @@ import ( type Client interface { GetSubnet(ctx context.Context, uuid string) (*v3.SubnetIntentResponse, error) + ListAllHost(ctx context.Context) (*v3.HostListResponse, error) ListSubnet(ctx context.Context, getEntitiesRequest *v3.DSMetadata) (*v3.SubnetListIntentResponse, error) GetImage(ctx context.Context, uuid string) (*v3.ImageIntentResponse, error) ListImage(ctx context.Context, getEntitiesRequest *v3.DSMetadata) (*v3.ImageListIntentResponse, error) diff --git a/pkg/providers/nutanix/config/md-template.yaml b/pkg/providers/nutanix/config/md-template.yaml index 4d7717cc2ebc..8cb017650596 100644 --- a/pkg/providers/nutanix/config/md-template.yaml +++ b/pkg/providers/nutanix/config/md-template.yaml @@ -94,6 +94,18 @@ spec: value: "{{ .Value }}" {{- end }} {{- end }} +{{- if .GPUs }} + gpus: +{{- range .GPUs }} +{{- if (eq .Type "deviceID") }} + - type: deviceID + deviceID: {{ .DeviceID }} +{{- else if (eq .Type "name") }} + - type: name + name: "{{ .Name }}" +{{- end }} +{{- end }} +{{- end }} --- apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 kind: KubeadmConfigTemplate diff --git a/pkg/providers/nutanix/mocks/client.go b/pkg/providers/nutanix/mocks/client.go index 0e3cea843469..796e0ea723ec 100644 
--- a/pkg/providers/nutanix/mocks/client.go +++ b/pkg/providers/nutanix/mocks/client.go @@ -155,6 +155,21 @@ func (mr *MockClientMockRecorder) GetSubnet(ctx, uuid interface{}) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetSubnet", reflect.TypeOf((*MockClient)(nil).GetSubnet), ctx, uuid) } +// ListAllHost mocks base method. +func (m *MockClient) ListAllHost(ctx context.Context) (*v3.HostListResponse, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ListAllHost", ctx) + ret0, _ := ret[0].(*v3.HostListResponse) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ListAllHost indicates an expected call of ListAllHost. +func (mr *MockClientMockRecorder) ListAllHost(ctx interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListAllHost", reflect.TypeOf((*MockClient)(nil).ListAllHost), ctx) +} + // ListCategories mocks base method. func (m *MockClient) ListCategories(ctx context.Context, getEntitiesRequest *v3.CategoryListMetadata) (*v3.CategoryKeyListResponse, error) { m.ctrl.T.Helper() diff --git a/pkg/providers/nutanix/provider_test.go b/pkg/providers/nutanix/provider_test.go index 5dd4bab83bbe..edd67448282c 100644 --- a/pkg/providers/nutanix/provider_test.go +++ b/pkg/providers/nutanix/provider_test.go @@ -418,6 +418,7 @@ func TestNutanixProviderSetupAndValidateCreate(t *testing.T) { }, } mockClient.EXPECT().ListImage(gomock.Any(), gomock.Any()).Return(images, nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil).AnyTimes() mockCertValidator := mockCrypto.NewMockTlsValidator(ctrl) mockCertValidator.EXPECT().ValidateCert(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil) mockCertValidator.EXPECT().ValidateCert(gomock.Any(), gomock.Any(), gomock.Any()).Return(errors.New("invalid cert")) diff --git a/pkg/providers/nutanix/template.go b/pkg/providers/nutanix/template.go index 8a52b0ecd093..507d2ec32c47 100644 --- 
a/pkg/providers/nutanix/template.go +++ b/pkg/providers/nutanix/template.go @@ -415,6 +415,10 @@ func buildTemplateMapMD(clusterSpec *cluster.Spec, workerNodeGroupMachineSpec v1 values["additionalCategories"] = workerNodeGroupMachineSpec.AdditionalCategories } + if len(workerNodeGroupMachineSpec.GPUs) > 0 { + values["GPUs"] = workerNodeGroupMachineSpec.GPUs + } + if workerNodeGroupConfiguration.KubeletConfiguration != nil { wnKubeletConfig := workerNodeGroupConfiguration.KubeletConfiguration.Object if _, ok := wnKubeletConfig["tlsCipherSuites"]; !ok { diff --git a/pkg/providers/nutanix/template_test.go b/pkg/providers/nutanix/template_test.go index 40a59ae613d7..71045105a515 100644 --- a/pkg/providers/nutanix/template_test.go +++ b/pkg/providers/nutanix/template_test.go @@ -726,6 +726,52 @@ func TestTemplateBuilderFailureDomains(t *testing.T) { } } +func TestTemplateBuilderGPUs(t *testing.T) { + for _, tc := range []struct { + Input string + Output string + OutputMD string + }{ + { + Input: "testdata/eksa-cluster-gpus.yaml", + Output: "testdata/expected_results_gpus.yaml", + OutputMD: "testdata/expected_results_gpus_md.yaml", + }, + } { + clusterSpec := test.NewFullClusterSpec(t, tc.Input) + + machineCfg := clusterSpec.NutanixMachineConfig(clusterSpec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name) + workerConfs := map[string]anywherev1.NutanixMachineConfigSpec{ + "eksa-unit-test": machineCfg.Spec, + } + + t.Setenv(constants.EksaNutanixUsernameKey, "admin") + t.Setenv(constants.EksaNutanixPasswordKey, "password") + creds := GetCredsFromEnv() + + bldr := NewNutanixTemplateBuilder(&clusterSpec.NutanixDatacenter.Spec, &machineCfg.Spec, &machineCfg.Spec, + workerConfs, creds, time.Now) + + cpSpec, err := bldr.GenerateCAPISpecControlPlane(clusterSpec) + assert.NoError(t, err) + assert.NotNil(t, cpSpec) + test.AssertContentToFile(t, string(cpSpec), tc.Output) + + workloadTemplateNames := map[string]string{ + "eksa-unit-test": "eksa-unit-test", + } + 
kubeadmconfigTemplateNames := map[string]string{ + "eksa-unit-test": "eksa-unit-test", + } + + data, err := bldr.GenerateCAPISpecWorkers(clusterSpec, workloadTemplateNames, kubeadmconfigTemplateNames) + + assert.NoError(t, err) + + test.AssertContentToFile(t, string(data), tc.OutputMD) + } +} + func minimalNutanixConfigSpec(t *testing.T) (*anywherev1.NutanixDatacenterConfig, *anywherev1.NutanixMachineConfig, map[string]anywherev1.NutanixMachineConfigSpec) { dcConf := &anywherev1.NutanixDatacenterConfig{} err := yaml.Unmarshal([]byte(nutanixDatacenterConfigSpec), dcConf) diff --git a/pkg/providers/nutanix/testdata/eksa-cluster-gpus.yaml b/pkg/providers/nutanix/testdata/eksa-cluster-gpus.yaml new file mode 100644 index 000000000000..bcd875ebfbbd --- /dev/null +++ b/pkg/providers/nutanix/testdata/eksa-cluster-gpus.yaml @@ -0,0 +1,75 @@ +apiVersion: anywhere.eks.amazonaws.com/v1alpha1 +kind: Cluster +metadata: + name: eksa-unit-test + namespace: default +spec: + kubernetesVersion: "1.19" + controlPlaneConfiguration: + name: eksa-unit-test + count: 3 + endpoint: + host: test-ip + machineGroupRef: + name: eksa-unit-test + kind: NutanixMachineConfig + workerNodeGroupConfigurations: + - count: 4 + name: eksa-unit-test + machineGroupRef: + name: eksa-unit-test + kind: NutanixMachineConfig + datacenterRef: + kind: NutanixDatacenterConfig + name: eksa-unit-test + clusterNetwork: + cni: "cilium" + pods: + cidrBlocks: + - 192.168.0.0/16 + services: + cidrBlocks: + - 10.96.0.0/12 +--- +apiVersion: anywhere.eks.amazonaws.com/v1alpha1 +kind: NutanixDatacenterConfig +metadata: + name: eksa-unit-test + namespace: default +spec: + endpoint: "prism.nutanix.com" + port: 9440 + credentialRef: + kind: Secret + name: "nutanix-credentials" +--- +apiVersion: anywhere.eks.amazonaws.com/v1alpha1 +kind: NutanixMachineConfig +metadata: + name: eksa-unit-test + namespace: default +spec: + vcpusPerSocket: 1 + vcpuSockets: 4 + memorySize: 8Gi + image: + type: "name" + name: "prism-image" + 
cluster: + type: "name" + name: "prism-cluster" + subnet: + type: "name" + name: "prism-subnet" + gpus: + - type: deviceID + deviceID: 8757 + - type: name + name: "Ampere 40" + systemDiskSize: 40Gi + osFamily: "ubuntu" + users: + - name: "mySshUsername" + sshAuthorizedKeys: + - "mySshAuthorizedKey" +--- diff --git a/pkg/providers/nutanix/testdata/expected_results_gpus.yaml b/pkg/providers/nutanix/testdata/expected_results_gpus.yaml new file mode 100644 index 000000000000..e9196e6783b2 --- /dev/null +++ b/pkg/providers/nutanix/testdata/expected_results_gpus.yaml @@ -0,0 +1,611 @@ +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: NutanixCluster +metadata: + name: "eksa-unit-test" + namespace: "eksa-system" +spec: + failureDomains: [] + prismCentral: + address: "prism.nutanix.com" + port: 9440 + insecure: false + credentialRef: + name: "capx-eksa-unit-test" + kind: Secret + controlPlaneEndpoint: + host: "test-ip" + port: 6443 +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + labels: + cluster.x-k8s.io/cluster-name: "eksa-unit-test" + name: "eksa-unit-test" + namespace: "eksa-system" +spec: + clusterNetwork: + services: + cidrBlocks: [10.96.0.0/12] + pods: + cidrBlocks: [192.168.0.0/16] + serviceDomain: "cluster.local" + controlPlaneRef: + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + kind: KubeadmControlPlane + name: "eksa-unit-test" + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixCluster + name: "eksa-unit-test" +--- +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +kind: KubeadmControlPlane +metadata: + name: "eksa-unit-test" + namespace: "eksa-system" +spec: + replicas: 3 + version: "v1.19.8-eks-1-19-4" + machineTemplate: + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixMachineTemplate + name: "" + kubeadmConfigSpec: + clusterConfiguration: + imageRepository: "public.ecr.aws/eks-distro/kubernetes" + apiServer: + certSANs: + - localhost + - 
127.0.0.1 + - 0.0.0.0 + extraArgs: + cloud-provider: external + audit-policy-file: /etc/kubernetes/audit-policy.yaml + audit-log-path: /var/log/kubernetes/api-audit.log + audit-log-maxage: "30" + audit-log-maxbackup: "10" + audit-log-maxsize: "512" + extraVolumes: + - hostPath: /etc/kubernetes/audit-policy.yaml + mountPath: /etc/kubernetes/audit-policy.yaml + name: audit-policy + pathType: File + readOnly: true + - hostPath: /var/log/kubernetes + mountPath: /var/log/kubernetes + name: audit-log-dir + pathType: DirectoryOrCreate + readOnly: false + controllerManager: + extraArgs: + cloud-provider: external + enable-hostpath-provisioner: "true" + dns: + imageRepository: public.ecr.aws/eks-distro/coredns + imageTag: v1.8.0-eks-1-19-4 + etcd: + local: + imageRepository: public.ecr.aws/eks-distro/etcd-io + imageTag: v3.4.14-eks-1-19-4 + files: + - content: | + apiVersion: v1 + kind: Pod + metadata: + creationTimestamp: null + name: kube-vip + namespace: kube-system + spec: + containers: + - name: kube-vip + image: + imagePullPolicy: IfNotPresent + args: + - manager + env: + - name: vip_arp + value: "true" + - name: address + value: "test-ip" + - name: port + value: "6443" + - name: vip_cidr + value: "32" + - name: cp_enable + value: "true" + - name: cp_namespace + value: kube-system + - name: vip_ddns + value: "false" + - name: vip_leaderelection + value: "true" + - name: vip_leaseduration + value: "15" + - name: vip_renewdeadline + value: "10" + - name: vip_retryperiod + value: "2" + - name: svc_enable + value: "false" + - name: lb_enable + value: "false" + securityContext: + capabilities: + add: + - NET_ADMIN + - SYS_TIME + - NET_RAW + volumeMounts: + - mountPath: /etc/kubernetes/admin.conf + name: kubeconfig + resources: {} + hostNetwork: true + volumes: + - name: kubeconfig + hostPath: + type: FileOrCreate + path: /etc/kubernetes/admin.conf + status: {} + owner: root:root + path: /etc/kubernetes/manifests/kube-vip.yaml + - content: | + apiVersion: 
audit.k8s.io/v1beta1 + kind: Policy + rules: + # Log aws-auth configmap changes + - level: RequestResponse + namespaces: ["kube-system"] + verbs: ["update", "patch", "delete"] + resources: + - group: "" # core + resources: ["configmaps"] + resourceNames: ["aws-auth"] + omitStages: + - "RequestReceived" + # The following requests were manually identified as high-volume and low-risk, + # so drop them. + - level: None + users: ["system:kube-proxy"] + verbs: ["watch"] + resources: + - group: "" # core + resources: ["endpoints", "services", "services/status"] + - level: None + users: ["kubelet"] # legacy kubelet identity + verbs: ["get"] + resources: + - group: "" # core + resources: ["nodes", "nodes/status"] + - level: None + userGroups: ["system:nodes"] + verbs: ["get"] + resources: + - group: "" # core + resources: ["nodes", "nodes/status"] + - level: None + users: + - system:kube-controller-manager + - system:kube-scheduler + - system:serviceaccount:kube-system:endpoint-controller + verbs: ["get", "update"] + namespaces: ["kube-system"] + resources: + - group: "" # core + resources: ["endpoints"] + - level: None + users: ["system:apiserver"] + verbs: ["get"] + resources: + - group: "" # core + resources: ["namespaces", "namespaces/status", "namespaces/finalize"] + # Don't log HPA fetching metrics. + - level: None + users: + - system:kube-controller-manager + verbs: ["get", "list"] + resources: + - group: "metrics.k8s.io" + # Don't log these read-only URLs. + - level: None + nonResourceURLs: + - /healthz* + - /version + - /swagger* + # Don't log events requests. 
+ - level: None + resources: + - group: "" # core + resources: ["events"] + # node and pod status calls from nodes are high-volume and can be large, don't log responses for expected updates from nodes + - level: Request + users: ["kubelet", "system:node-problem-detector", "system:serviceaccount:kube-system:node-problem-detector"] + verbs: ["update","patch"] + resources: + - group: "" # core + resources: ["nodes/status", "pods/status"] + omitStages: + - "RequestReceived" + - level: Request + userGroups: ["system:nodes"] + verbs: ["update","patch"] + resources: + - group: "" # core + resources: ["nodes/status", "pods/status"] + omitStages: + - "RequestReceived" + # deletecollection calls can be large, don't log responses for expected namespace deletions + - level: Request + users: ["system:serviceaccount:kube-system:namespace-controller"] + verbs: ["deletecollection"] + omitStages: + - "RequestReceived" + # Secrets, ConfigMaps, and TokenReviews can contain sensitive & binary data, + # so only log at the Metadata level. + - level: Metadata + resources: + - group: "" # core + resources: ["secrets", "configmaps"] + - group: authentication.k8s.io + resources: ["tokenreviews"] + omitStages: + - "RequestReceived" + - level: Request + resources: + - group: "" + resources: ["serviceaccounts/token"] + # Get repsonses can be large; skip them. 
+ - level: Request + verbs: ["get", "list", "watch"] + resources: + - group: "" # core + - group: "admissionregistration.k8s.io" + - group: "apiextensions.k8s.io" + - group: "apiregistration.k8s.io" + - group: "apps" + - group: "authentication.k8s.io" + - group: "authorization.k8s.io" + - group: "autoscaling" + - group: "batch" + - group: "certificates.k8s.io" + - group: "extensions" + - group: "metrics.k8s.io" + - group: "networking.k8s.io" + - group: "policy" + - group: "rbac.authorization.k8s.io" + - group: "scheduling.k8s.io" + - group: "settings.k8s.io" + - group: "storage.k8s.io" + omitStages: + - "RequestReceived" + # Default level for known APIs + - level: RequestResponse + resources: + - group: "" # core + - group: "admissionregistration.k8s.io" + - group: "apiextensions.k8s.io" + - group: "apiregistration.k8s.io" + - group: "apps" + - group: "authentication.k8s.io" + - group: "authorization.k8s.io" + - group: "autoscaling" + - group: "batch" + - group: "certificates.k8s.io" + - group: "extensions" + - group: "metrics.k8s.io" + - group: "networking.k8s.io" + - group: "policy" + - group: "rbac.authorization.k8s.io" + - group: "scheduling.k8s.io" + - group: "settings.k8s.io" + - group: "storage.k8s.io" + omitStages: + - "RequestReceived" + # Default level for all other requests. 
+ - level: Metadata + omitStages: + - "RequestReceived" + owner: root:root + path: /etc/kubernetes/audit-policy.yaml + initConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + # We have to pin the cgroupDriver to cgroupfs as kubeadm >=1.21 defaults to systemd + # kind will implement systemd support in: https://github.com/kubernetes-sigs/kind/issues/1726 + #cgroup-driver: cgroupfs + eviction-hard: nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0% + tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 + joinConfiguration: + nodeRegistration: + criSocket: /var/run/containerd/containerd.sock + kubeletExtraArgs: + cloud-provider: external + read-only-port: "0" + anonymous-auth: "false" + tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 + name: "{{ ds.meta_data.hostname }}" + users: + - name: "mySshUsername" + lockPassword: false + sudo: ALL=(ALL) NOPASSWD:ALL + sshAuthorizedKeys: + - "mySshAuthorizedKey" + preKubeadmCommands: + - hostnamectl set-hostname "{{ ds.meta_data.hostname }}" + - echo "::1 ipv6-localhost ipv6-loopback" >/etc/hosts + - echo "127.0.0.1 localhost" >>/etc/hosts + - echo "127.0.0.1 {{ ds.meta_data.hostname }}" >> /etc/hosts + postKubeadmCommands: + - echo export KUBECONFIG=/etc/kubernetes/admin.conf >> /root/.bashrc + useExperimentalRetryJoin: true +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: NutanixMachineTemplate +metadata: + name: "" + namespace: "eksa-system" +spec: + template: + spec: + providerID: "nutanix://eksa-unit-test-m1" + vcpusPerSocket: 1 + vcpuSockets: 4 + memorySize: 8Gi + systemDiskSize: 40Gi + image: + type: name + name: "prism-image" + + cluster: + type: name + name: "prism-cluster" + subnet: + - type: name + name: "prism-subnet" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: eksa-unit-test-nutanix-ccm + namespace: "eksa-system" +data: + nutanix-ccm.yaml: | + --- + apiVersion: v1 + kind: ServiceAccount + metadata: + name: cloud-controller-manager + 
namespace: kube-system + --- + kind: ConfigMap + apiVersion: v1 + metadata: + name: nutanix-config + namespace: kube-system + data: + nutanix_config.json: |- + { + "prismCentral": { + "address": "prism.nutanix.com", + "port": 9440, + "insecure": false, + "credentialRef": { + "kind": "secret", + "name": "nutanix-creds", + "namespace": "kube-system" + } + }, + "enableCustomLabeling": false, + "topologyDiscovery": { + "type": "Prism" + } + } + --- + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + annotations: + rbac.authorization.kubernetes.io/autoupdate: "true" + name: system:cloud-controller-manager + rules: + - apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update + - apiGroups: + - "" + resources: + - nodes + verbs: + - "*" + - apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch + - apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - create + - apiGroups: + - "" + resources: + - endpoints + verbs: + - create + - get + - list + - watch + - update + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + --- + kind: ClusterRoleBinding + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: system:cloud-controller-manager + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:cloud-controller-manager + subjects: + - kind: ServiceAccount + name: cloud-controller-manager + namespace: kube-system + --- + apiVersion: apps/v1 + kind: Deployment + metadata: + labels: + k8s-app: nutanix-cloud-controller-manager + name: nutanix-cloud-controller-manager + namespace: kube-system + spec: + replicas: 1 + selector: + matchLabels: + k8s-app: nutanix-cloud-controller-manager + strategy: + type: Recreate + template: 
+ metadata: + labels: + k8s-app: nutanix-cloud-controller-manager + spec: + hostNetwork: true + priorityClassName: system-cluster-critical + nodeSelector: + node-role.kubernetes.io/control-plane: "" + serviceAccountName: cloud-controller-manager + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + k8s-app: nutanix-cloud-controller-manager + topologyKey: kubernetes.io/hostname + dnsPolicy: Default + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 120 + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 120 + - effect: NoSchedule + key: node.cloudprovider.kubernetes.io/uninitialized + operator: Exists + - effect: NoSchedule + key: node.kubernetes.io/not-ready + operator: Exists + containers: + - image: "" + imagePullPolicy: IfNotPresent + name: nutanix-cloud-controller-manager + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - "--leader-elect=true" + - "--cloud-config=/etc/cloud/nutanix_config.json" + resources: + requests: + cpu: 100m + memory: 50Mi + volumeMounts: + - mountPath: /etc/cloud + name: nutanix-config-volume + readOnly: true + volumes: + - name: nutanix-config-volume + configMap: + name: nutanix-config +--- +apiVersion: addons.cluster.x-k8s.io/v1beta1 +kind: ClusterResourceSet +metadata: + name: eksa-unit-test-nutanix-ccm-crs + namespace: "eksa-system" +spec: + clusterSelector: + matchLabels: + cluster.x-k8s.io/cluster-name: "eksa-unit-test" + resources: + - kind: ConfigMap + name: eksa-unit-test-nutanix-ccm + - kind: Secret + name: eksa-unit-test-nutanix-ccm-secret + strategy: Reconcile +--- +apiVersion: v1 +kind: Secret +metadata: + name: 
"eksa-unit-test-nutanix-ccm-secret" + namespace: "eksa-system" +stringData: + nutanix-ccm-secret.yaml: | + apiVersion: v1 + kind: Secret + metadata: + name: nutanix-creds + namespace: kube-system + stringData: + credentials: |- + [ + { + "type": "basic_auth", + "data": { + "prismCentral": { + "username": "admin", + "password": "password" + }, + "prismElements": null + } + } + ] +type: addons.cluster.x-k8s.io/resource-set diff --git a/pkg/providers/nutanix/testdata/expected_results_gpus_md.yaml b/pkg/providers/nutanix/testdata/expected_results_gpus_md.yaml new file mode 100644 index 000000000000..07e96a0f7901 --- /dev/null +++ b/pkg/providers/nutanix/testdata/expected_results_gpus_md.yaml @@ -0,0 +1,86 @@ +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineDeployment +metadata: + labels: + cluster.x-k8s.io/cluster-name: "eksa-unit-test" + name: "eksa-unit-test-eksa-unit-test" + namespace: "eksa-system" +spec: + clusterName: "eksa-unit-test" + replicas: 4 + selector: + matchLabels: {} + template: + metadata: + labels: + cluster.x-k8s.io/cluster-name: "eksa-unit-test" + spec: + bootstrap: + configRef: + apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 + kind: KubeadmConfigTemplate + name: "eksa-unit-test" + clusterName: "eksa-unit-test" + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixMachineTemplate + name: "eksa-unit-test" + version: "v1.19.8-eks-1-19-4" +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: NutanixMachineTemplate +metadata: + name: "eksa-unit-test" + namespace: "eksa-system" +spec: + template: + spec: + providerID: "nutanix://eksa-unit-test-m1" + vcpusPerSocket: 1 + vcpuSockets: 4 + memorySize: 8Gi + systemDiskSize: 40Gi + image: + type: name + name: "prism-image" + + cluster: + type: name + name: "prism-cluster" + subnet: + - type: name + name: "prism-subnet" + gpus: + - type: deviceID + deviceID: 8757 + - type: name + name: "Ampere 40" +--- +apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 +kind: 
KubeadmConfigTemplate +metadata: + name: "eksa-unit-test" + namespace: "eksa-system" +spec: + template: + spec: + preKubeadmCommands: + - hostnamectl set-hostname "{{ ds.meta_data.hostname }}" + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + # We have to pin the cgroupDriver to cgroupfs as kubeadm >=1.21 defaults to systemd + # kind will implement systemd support in: https://github.com/kubernetes-sigs/kind/issues/1726 + #cgroup-driver: cgroupfs + eviction-hard: nodefs.available<0%,nodefs.inodesFree<0%,imagefs.available<0% + tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 + name: '{{ ds.meta_data.hostname }}' + users: + - name: "mySshUsername" + lockPassword: false + sudo: ALL=(ALL) NOPASSWD:ALL + sshAuthorizedKeys: + - "mySshAuthorizedKey" + +--- diff --git a/pkg/providers/nutanix/validator.go b/pkg/providers/nutanix/validator.go index 827d91b20652..1e23c924292f 100644 --- a/pkg/providers/nutanix/validator.go +++ b/pkg/providers/nutanix/validator.go @@ -67,6 +67,10 @@ func (v *Validator) ValidateClusterSpec(ctx context.Context, spec *cluster.Spec, } } + if err := v.validateFreeGPU(ctx, client, spec); err != nil { + return err + } + return v.checkImageNameMatchesKubernetesVersion(ctx, spec, client) } @@ -274,6 +278,14 @@ func (v *Validator) ValidateMachineConfig(ctx context.Context, client Client, co } } + if config.Spec.GPUs != nil { + for _, gpu := range config.Spec.GPUs { + if err := v.validateGPUConfig(gpu); err != nil { + return err + } + } + } + return nil } @@ -441,6 +453,243 @@ func (v *Validator) validateAdditionalCategories(ctx context.Context, client Cli return nil } +func (v *Validator) validateGPUConfig(gpu anywherev1.NutanixGPUIdentifier) error { + if gpu.Type == "" { + return fmt.Errorf("missing GPU type") + } + + if gpu.Type != anywherev1.NutanixGPUIdentifierDeviceID && gpu.Type != anywherev1.NutanixGPUIdentifierName { + return fmt.Errorf("invalid GPU identifier type: %s; valid types are: %q and %q", 
gpu.Type, anywherev1.NutanixGPUIdentifierDeviceID, anywherev1.NutanixGPUIdentifierName) + } + + if gpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { + if gpu.DeviceID == nil { + return fmt.Errorf("missing GPU device ID") + } + } else { + if gpu.Name == "" { + return fmt.Errorf("missing GPU name") + } + } + + return nil +} + +func getRequestedGPUsForAllMachines(machineCount int, requestedGpus []anywherev1.NutanixGPUIdentifier) []anywherev1.NutanixGPUIdentifier { + allMachinesRequestedGPUs := make([]anywherev1.NutanixGPUIdentifier, 0) + for i := 0; i < machineCount; i++ { + allMachinesRequestedGPUs = append(allMachinesRequestedGPUs, requestedGpus...) + } + return allMachinesRequestedGPUs +} + +func (v *Validator) tryAssignGPUsToMachineConfig(machineCount int, requestedGpus []anywherev1.NutanixGPUIdentifier, clusterGpuList []v3.GPU, cluster anywherev1.NutanixResourceIdentifier) ([]v3.GPU, error) { + allMachinesRequestedGPUs := getRequestedGPUsForAllMachines(machineCount, requestedGpus) + + for _, requestedGpu := range allMachinesRequestedGPUs { + found := -1 + for index, gpu := range clusterGpuList { + if isRequestedGPUAssignable(gpu, requestedGpu) { + found = index + break + } + } + + if found == -1 { + return nil, errorGPUNotFound(requestedGpu, cluster) + } + + clusterGpuList = append(clusterGpuList[:found], clusterGpuList[found+1:]...) 
+ } + + return clusterGpuList, nil +} + +func (v *Validator) isGPURequested(configs map[string]*anywherev1.NutanixMachineConfig) bool { + for _, machineConfig := range configs { + if machineConfig.Spec.GPUs != nil { + return true + } + } + + return false +} + +func (v *Validator) initAvailableGPUsMap(hosts []*v3.HostResponse) map[string][]v3.GPU { + availableGpu := make(map[string][]v3.GPU) + for _, host := range hosts { + if host.Status != nil && + host.Status.Resources != nil && + host.Status.ClusterReference != nil && + host.Status.ClusterReference.UUID != "" { + clusterUUID := host.Status.ClusterReference.UUID + + availableGpu[clusterUUID] = make([]v3.GPU, 0) + } + } + return availableGpu +} + +func (v *Validator) getAvailableGPUs(hosts []*v3.HostResponse) (map[string][]v3.GPU, error) { + availableGpu := v.initAvailableGPUsMap(hosts) + + for _, host := range hosts { + if host.Status != nil && + host.Status.Resources != nil && + host.Status.Resources.GPUList != nil && + host.Status.ClusterReference != nil && + host.Status.ClusterReference.UUID != "" { + clusterUUID := host.Status.ClusterReference.UUID + + for _, gpu := range host.Status.Resources.GPUList { + availableGpu[clusterUUID] = append(availableGpu[clusterUUID], *gpu) + } + } + } + + return availableGpu, nil +} + +func (v *Validator) tryAssignGPUsToAllMachineConfigs(ctx context.Context, v3Client Client, cluster *cluster.Spec, availableGpu map[string][]v3.GPU) error { + configs := cluster.NutanixMachineConfigs + machineCount := v.getMachineCountForAllMachineConfigs(cluster) + + for _, machineConfig := range configs { + clusterUUID, err := getClusterUUID(ctx, v3Client, machineConfig.Spec.Cluster) + if err != nil { + return err + } + + if machineConfig.Spec.GPUs != nil { + if _, ok := machineCount[machineConfig.Name]; ok { + availableGpu[clusterUUID], err = v.tryAssignGPUsToMachineConfig(machineCount[machineConfig.Name], machineConfig.Spec.GPUs, availableGpu[clusterUUID], machineConfig.Spec.Cluster) + if err 
!= nil { + return err + } + } + } + } + + return nil +} + +func (v *Validator) getMachineCountForAllMachineConfigs(clusterSpec *cluster.Spec) map[string]int { + machineCountMap := make(map[string]int) + cluster := clusterSpec.Cluster.Spec + if cluster.ControlPlaneConfiguration.MachineGroupRef.Kind == constants.NutanixMachineConfigKind { + machineCountMap[cluster.ControlPlaneConfiguration.MachineGroupRef.Name] = cluster.ControlPlaneConfiguration.Count + } + + if cluster.ExternalEtcdConfiguration != nil && + cluster.ExternalEtcdConfiguration.MachineGroupRef.Kind == constants.NutanixMachineConfigKind { + machineCountMap[cluster.ExternalEtcdConfiguration.MachineGroupRef.Name] = cluster.ExternalEtcdConfiguration.Count + } + + for _, workerNodeGroupConfiguration := range cluster.WorkerNodeGroupConfigurations { + if workerNodeGroupConfiguration.MachineGroupRef.Kind == constants.NutanixMachineConfigKind && + workerNodeGroupConfiguration.Count != nil { + machineCountMap[workerNodeGroupConfiguration.MachineGroupRef.Name] = *workerNodeGroupConfiguration.Count + } + } + return machineCountMap +} + +func (v *Validator) getGPUModeMapping(hosts []*v3.HostResponse) (map[int64]string, map[string]string, error) { + gpuDeviceIDToMode := make(map[int64]string) + gpuNameToMode := make(map[string]string) + + for _, host := range hosts { + if host.Status != nil && + host.Status.Resources != nil && + host.Status.Resources.GPUList != nil { + for _, gpu := range host.Status.Resources.GPUList { + if gpu.DeviceID != nil { + gpuDeviceIDToMode[*gpu.DeviceID] = gpu.Mode + } + if gpu.Name != "" { + gpuNameToMode[gpu.Name] = gpu.Mode + } + } + } + } + + return gpuDeviceIDToMode, gpuNameToMode, nil +} + +func (v *Validator) validateGPUModeNotMixed(hosts []*v3.HostResponse, cluster *cluster.Spec) error { + configs := cluster.NutanixMachineConfigs + + gpuDeviceIDToMode, gpuNameToMode, err := v.getGPUModeMapping(hosts) + if err != nil { + return err + } + + gpuMode := "" + getGpuModeFunc := 
createGetGpuModeFunc(gpuDeviceIDToMode, gpuNameToMode) + for _, machineConfig := range configs { + if machineConfig.Spec.GPUs != nil { + for _, gpu := range machineConfig.Spec.GPUs { + if gpuMode == "" { + gpuMode = getGpuModeFunc(gpu) + } else { + if gpuMode != getGpuModeFunc(gpu) { + return fmt.Errorf("all GPUs in a machine config must be of the same mode, vGPU or passthrough") + } + } + } + } + } + + return nil +} + +func createGetGpuModeFunc(gpuDeviceIDToMode map[int64]string, gpuNameToMode map[string]string) func(gpu anywherev1.NutanixGPUIdentifier) string { + return func(gpu anywherev1.NutanixGPUIdentifier) string { + if gpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { + return gpuDeviceIDToMode[*gpu.DeviceID] + } + + return gpuNameToMode[gpu.Name] + } +} + +func (v *Validator) validateFreeGPU(ctx context.Context, v3Client Client, cluster *cluster.Spec) error { + res, err := v3Client.ListAllHost(ctx) + if err != nil || len(res.Entities) == 0 { + return fmt.Errorf("No GPUs found: %v", err) + } + + if v.isGPURequested(cluster.NutanixMachineConfigs) { + err := v.validateGPUModeNotMixed(res.Entities, cluster) + if err != nil { + return err + } + + availableGpu, err := v.getAvailableGPUs(res.Entities) + if err != nil { + return err + } + + if err = v.tryAssignGPUsToAllMachineConfigs(ctx, v3Client, cluster, availableGpu); err != nil { + return err + } + } + + return nil +} + +func (v *Validator) validateUpgradeRolloutStrategy(clusterSpec *cluster.Spec) error { + if clusterSpec.Cluster.Spec.ControlPlaneConfiguration.UpgradeRolloutStrategy != nil { + return fmt.Errorf("Upgrade rollout strategy customization is not supported for nutanix provider") + } + for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { + if workerNodeGroupConfiguration.UpgradeRolloutStrategy != nil { + return fmt.Errorf("Upgrade rollout strategy customization is not supported for nutanix provider") + } + } + return nil +} + // 
findSubnetUUIDByName retrieves the subnet uuid by the given subnet name. func findSubnetUUIDByName(ctx context.Context, v3Client Client, clusterUUID, subnetName string) (*string, error) { res, err := v3Client.ListSubnet(ctx, &v3.DSMetadata{ @@ -546,14 +795,29 @@ func findProjectUUIDByName(ctx context.Context, v3Client Client, projectName str return res.Entities[0].Metadata.UUID, nil } -func (v *Validator) validateUpgradeRolloutStrategy(clusterSpec *cluster.Spec) error { - if clusterSpec.Cluster.Spec.ControlPlaneConfiguration.UpgradeRolloutStrategy != nil { - return fmt.Errorf("Upgrade rollout strategy customization is not supported for nutanix provider") +func isRequestedGPUAssignable(gpu v3.GPU, requestedGpu anywherev1.NutanixGPUIdentifier) bool { + if requestedGpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { + return (*gpu.DeviceID == *requestedGpu.DeviceID) && gpu.Assignable } - for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { - if workerNodeGroupConfiguration.UpgradeRolloutStrategy != nil { - return fmt.Errorf("Upgrade rollout strategy customization is not supported for nutanix provider") + + return (gpu.Name == requestedGpu.Name) && gpu.Assignable +} + +func errorGPUNotFound(gpu anywherev1.NutanixGPUIdentifier, cluster anywherev1.NutanixResourceIdentifier) error { + clusterAddonString := "" + if cluster.Type == anywherev1.NutanixIdentifierUUID { + if cluster.UUID != nil { + clusterAddonString = fmt.Sprintf("on cluster with UUID %s", *cluster.UUID) + } + } else { + if cluster.Name != nil { + clusterAddonString = fmt.Sprintf("on cluster with name %s", *cluster.Name) } } - return nil + + if gpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { + return fmt.Errorf("GPU with device ID %d not found %s", *gpu.DeviceID, clusterAddonString) + } + + return fmt.Errorf("GPU with name %s not found %s", gpu.Name, clusterAddonString) } diff --git a/pkg/providers/nutanix/validator_test.go 
b/pkg/providers/nutanix/validator_test.go index e0609c415923..634249187484 100644 --- a/pkg/providers/nutanix/validator_test.go +++ b/pkg/providers/nutanix/validator_test.go @@ -21,6 +21,8 @@ import ( "github.com/aws/eks-anywhere/internal/test" "github.com/aws/eks-anywhere/pkg/api/v1alpha1" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" + "github.com/aws/eks-anywhere/pkg/cluster" + "github.com/aws/eks-anywhere/pkg/constants" mockCrypto "github.com/aws/eks-anywhere/pkg/crypto/mocks" mocknutanix "github.com/aws/eks-anywhere/pkg/providers/nutanix/mocks" "github.com/aws/eks-anywhere/pkg/utils/ptr" @@ -638,6 +640,70 @@ func TestNutanixValidatorValidateMachineConfig(t *testing.T) { }, expectedError: "failed to find category value", }, + { + name: "invalid gpu identifier type", + setup: func(machineConf *anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterList(), nil).Times(2) + mockClient.EXPECT().ListSubnet(gomock.Any(), gomock.Any()).Return(fakeSubnetList(), nil) + mockClient.EXPECT().ListImage(gomock.Any(), gomock.Any()).Return(fakeImageList(), nil) + machineConf.Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "invalid", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "invalid GPU identifier type", + }, + { + name: "missing GPU type", + setup: func(machineConf *anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterList(), nil).Times(2) + mockClient.EXPECT().ListSubnet(gomock.Any(), gomock.Any()).Return(fakeSubnetList(), nil) + 
mockClient.EXPECT().ListImage(gomock.Any(), gomock.Any()).Return(fakeImageList(), nil) + machineConf.Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "missing GPU type", + }, + { + name: "missing GPU device ID", + setup: func(machineConf *anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterList(), nil).Times(2) + mockClient.EXPECT().ListSubnet(gomock.Any(), gomock.Any()).Return(fakeSubnetList(), nil) + mockClient.EXPECT().ListImage(gomock.Any(), gomock.Any()).Return(fakeImageList(), nil) + machineConf.Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "deviceID", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "missing GPU device ID", + }, + { + name: "missing GPU name", + setup: func(machineConf *anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterList(), nil).Times(2) + mockClient.EXPECT().ListSubnet(gomock.Any(), gomock.Any()).Return(fakeSubnetList(), nil) + mockClient.EXPECT().ListImage(gomock.Any(), gomock.Any()).Return(fakeImageList(), nil) + machineConf.Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "missing GPU name", + }, } 
for _, tc := range tests { @@ -658,6 +724,584 @@ func TestNutanixValidatorValidateMachineConfig(t *testing.T) { } } +func fakeHostList() *v3.HostListResponse { + return &v3.HostListResponse{ + Entities: []*v3.HostResponse{ + { + Status: &v3.HostStatus{ + ClusterReference: &v3.ReferenceValues{ + UUID: "a15f6966-bfc7-4d1e-8575-224096fc1cdb", + }, + Resources: &v3.HostResources{ + GPUList: []*v3.GPU{ + { + Assignable: false, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: false, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA 
A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(557), + Name: "NVIDIA A40-1Q", + Mode: "VIRTUAL", + }, + }, + }, + }, + }, + { + Status: &v3.HostStatus{ + ClusterReference: &v3.ReferenceValues{ + UUID: "4d69ca7d-022f-49d1-a454-74535993bda4", + }, + Resources: &v3.HostResources{ + GPUList: []*v3.GPU{ + { + Assignable: false, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: false, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + { + Assignable: true, + DeviceID: utils.Int64Ptr(8757), + Name: "Ampere 40", + Mode: "PASSTHROUGH_COMPUTE", + }, + }, + }, + }, + }, + { + Status: &v3.HostStatus{ + ClusterReference: &v3.ReferenceValues{ + UUID: "e0b1dfc7-5447-410f-b708-f9603e9be79a", + }, + Resources: &v3.HostResources{}, + }, + }, + }, + } +} + +func fakeEmptyHostList() *v3.HostListResponse { + return &v3.HostListResponse{ + Entities: []*v3.HostResponse{}, + } +} + +func fakeClusterListForFreeGPUTest() *v3.ClusterListIntentResponse { + return &v3.ClusterListIntentResponse{ + Entities: []*v3.ClusterIntentResponse{ + { + Metadata: &v3.Metadata{ + UUID: 
utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + }, + Spec: &v3.Cluster{ + Name: utils.StringPtr("prism-cluster"), + }, + Status: &v3.ClusterDefStatus{ + Resources: &v3.ClusterObj{ + Config: &v3.ClusterConfig{ + ServiceList: []*string{utils.StringPtr("AOS")}, + }, + }, + }, + }, + { + Metadata: &v3.Metadata{ + UUID: utils.StringPtr("4d69ca7d-022f-49d1-a454-74535993bda4"), + }, + Spec: &v3.Cluster{ + Name: utils.StringPtr("prism-cluster-1"), + }, + Status: &v3.ClusterDefStatus{ + Resources: &v3.ClusterObj{ + Config: &v3.ClusterConfig{ + ServiceList: []*string{utils.StringPtr("AOS")}, + }, + }, + }, + }, + { + Metadata: &v3.Metadata{ + UUID: utils.StringPtr("e0b1dfc7-5447-410f-b708-f9603e9be79a"), + }, + Spec: &v3.Cluster{ + Name: utils.StringPtr("prism-cluster-2"), + }, + Status: &v3.ClusterDefStatus{ + Resources: &v3.ClusterObj{ + Config: &v3.ClusterConfig{ + ServiceList: []*string{utils.StringPtr("AOS")}, + }, + }, + }, + }, + }, + } +} + +func TestNutanixValidatorValidateFreeGPU(t *testing.T) { + ctrl := gomock.NewController(t) + + tests := []struct { + name string + setup func(map[string]*anywherev1.NutanixMachineConfig, *mocknutanix.MockClient, *mockCrypto.MockTlsValidator, *mocknutanix.MockRoundTripper) *Validator + expectedError string + }{ + { + name: "not enough GPU resources available by name", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil) + machineConfigs["cp"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + } + machineConfigs["cp"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + 
Type: "name", + Name: "Ampere 40", + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + } + machineConfigs["worker"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + } + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "GPU with name Ampere 40 not found", + }, + { + name: "not enough GPU resources available by name in different PE (UUID)", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil) + machineConfigs["cp"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("e0b1dfc7-5447-410f-b708-f9603e9be79a"), + } + machineConfigs["cp"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{} + machineConfigs["worker"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + } + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, 
validator, &http.Client{Transport: transport}) + }, + expectedError: "GPU with name Ampere 40 not found on cluster with UUID a15f6966-bfc7-4d1e-8575-224096fc1cdb", + }, + { + name: "not enough GPU resources available by deviceID in different PE (UUID)", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil) + machineConfigs["cp"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("e0b1dfc7-5447-410f-b708-f9603e9be79a"), + } + machineConfigs["cp"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{} + machineConfigs["worker"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + } + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "GPU with device ID 8757 not found on cluster with UUID a15f6966-bfc7-4d1e-8575-224096fc1cdb", + }, + { + name: "not enough GPU resources available by deviceID", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), 
gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil) + machineConfigs["cp"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + } + machineConfigs["cp"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + } + machineConfigs["worker"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("a15f6966-bfc7-4d1e-8575-224096fc1cdb"), + } + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "GPU with device ID 8757 not found", + }, + { + name: "no GPU resources found", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeEmptyHostList(), nil) + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "name", + Name: "Ampere 40", + }, + } + clientCache := 
&ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "No GPUs found", + }, + { + name: "no GPU resources found: ListAllHost failed", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(nil, fmt.Errorf("failed to list hosts")) + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + { + Type: "name", + Name: "Ampere 40", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "No GPUs found", + }, + { + name: "mixed passthrough and vGPU mode GPUs in a machine config", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil) + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "name", + Name: "NVIDIA A40-1Q", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "all GPUs in a machine 
config must be of the same mode, vGPU or passthrough", + }, + { + name: "GPUs validation successful", + setup: func(machineConfigs map[string]*anywherev1.NutanixMachineConfig, mockClient *mocknutanix.MockClient, validator *mockCrypto.MockTlsValidator, transport *mocknutanix.MockRoundTripper) *Validator { + mockClient.EXPECT().ListCluster(gomock.Any(), gomock.Any()).Return(fakeClusterListForFreeGPUTest(), nil).AnyTimes() + mockClient.EXPECT().ListAllHost(gomock.Any()).Return(fakeHostList(), nil) + machineConfigs["cp"].Spec.Cluster = anywherev1.NutanixResourceIdentifier{ + Type: anywherev1.NutanixIdentifierUUID, + UUID: utils.StringPtr("4d69ca7d-022f-49d1-a454-74535993bda4"), + } + machineConfigs["cp"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + { + Type: "deviceID", + DeviceID: utils.Int64Ptr(8757), + }, + } + + machineConfigs["worker"].Spec.GPUs = []anywherev1.NutanixGPUIdentifier{ + { + Type: "name", + Name: "Ampere 40", + }, + } + clientCache := &ClientCache{clients: map[string]Client{"test": mockClient}} + return NewValidator(clientCache, validator, &http.Client{Transport: transport}) + }, + expectedError: "", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + machineConfigsNames := []string{"cp", "etcd", "worker"} + machineConfigs := make(map[string]*anywherev1.NutanixMachineConfig) + + for _, name := range machineConfigsNames { + machineConfigs[name] = &anywherev1.NutanixMachineConfig{} + err := yaml.Unmarshal([]byte(nutanixMachineConfigSpec), machineConfigs[name]) + machineConfigs[name].Name = machineConfigs[name].Name + "-" + name + require.NoError(t, err) + } + + mockClient := mocknutanix.NewMockClient(ctrl) + validator := tc.setup(machineConfigs, mockClient, mockCrypto.NewMockTlsValidator(ctrl), mocknutanix.NewMockRoundTripper(ctrl)) + spec := &cluster.Spec{ + Config: &cluster.Config{ + Cluster: &anywherev1.Cluster{ + Spec: anywherev1.ClusterSpec{ + ControlPlaneConfiguration: 
anywherev1.ControlPlaneConfiguration{ + Count: 1, + MachineGroupRef: &anywherev1.Ref{ + Name: "eksa-unit-test-cp", + Kind: constants.NutanixMachineConfigKind, + }, + }, + WorkerNodeGroupConfigurations: []anywherev1.WorkerNodeGroupConfiguration{ + { + Count: utils.IntPtr(2), + MachineGroupRef: &anywherev1.Ref{ + Name: "eksa-unit-test-worker", + Kind: constants.NutanixMachineConfigKind, + }, + }, + }, + ExternalEtcdConfiguration: &anywherev1.ExternalEtcdConfiguration{ + Count: 1, + MachineGroupRef: &anywherev1.Ref{ + Name: "eksa-unit-test-etcd", + Kind: constants.NutanixMachineConfigKind, + }, + }, + }, + }, + NutanixMachineConfigs: machineConfigs, + }, + } + err := validator.validateFreeGPU(context.Background(), mockClient, spec) + if tc.expectedError != "" { + assert.Contains(t, err.Error(), tc.expectedError) + } else { + assert.NoError(t, err) + } + }) + } +} + func TestNutanixValidatorValidateDatacenterConfig(t *testing.T) { tests := []struct { name string