diff --git a/cluster-autoscaler/cloudprovider/oci/README.md b/cluster-autoscaler/cloudprovider/oci/README.md
index e0d883ce5074..640e70607d86 100644
--- a/cluster-autoscaler/cloudprovider/oci/README.md
+++ b/cluster-autoscaler/cloudprovider/oci/README.md
@@ -166,6 +166,13 @@ use-instance-principals = true
 n/a
 
+### Node Group Auto Discovery
+`--node-group-auto-discovery` can be given in the pattern below. It discovers the nodepools under the given compartment by matching the nodepool tags (either Freeform or Defined tags). All of the parameters are mandatory.
+```
+clusterId:<clusterId>,compartmentId:<compartmentId>,nodepoolTags:<tagKey1>=<tagValue1>&<tagKey2>=<tagValue2>,min:<min>,max:<max>
+```
+Auto discovery cannot be used along with static discovery (the `nodes` parameter) to prevent conflicts.
+
 ## Deployment
 
 ### Create OCI config secret (only if _not_ using Instance Principals)
@@ -271,7 +278,8 @@ kubectl apply -f ./cloudprovider/oci/examples/oci-nodepool-cluster-autoscaler-w-
   correctly (`oci-cloud-controller-manager`).
 - Avoid manually changing pools that are managed by the Cluster Autoscaler. For example, do not add or remove nodes
   using kubectl, or using the Console (or the Oracle Cloud Infrastructure CLI or API).
-- `--node-group-auto-discovery` and `--node-autoprovisioning-enabled=true` are not supported.
+- `--node-autoprovisioning-enabled=true` is not supported.
+- The `--node-group-auto-discovery` and `--nodes` parameters cannot be used together as this can cause conflicts.
 - We set a `nvidia.com/gpu:NoSchedule` taint on nodes in a GPU enabled pools.
 
 ## Helpful links
diff --git a/cluster-autoscaler/cloudprovider/oci/common/oci_util.go b/cluster-autoscaler/cloudprovider/oci/common/oci_util.go
index 781cc01c5337..d1b6c91aa4b5 100644
--- a/cluster-autoscaler/cloudprovider/oci/common/oci_util.go
+++ b/cluster-autoscaler/cloudprovider/oci/common/oci_util.go
@@ -205,3 +205,24 @@ func GetAllPoolTypes(groups []string) (string, error) {
     }
     return ocidType, nil
 }
+
+// HasNodeGroupTags checks whether instancepoolTags or nodepoolTags is provided in the node-group-auto-discovery specs
+func HasNodeGroupTags(nodeGroupAutoDiscoveryList []string) (bool, bool, error) {
+    instancePoolTagsFound := false
+    nodePoolTagsFound := false
+    for _, arg := range nodeGroupAutoDiscoveryList {
+        if strings.Contains(arg, "nodepoolTags") {
+            nodePoolTagsFound = true
+        }
+        if strings.Contains(arg, "instancepoolTags") {
+            instancePoolTagsFound = true
+        }
+    }
+    if instancePoolTagsFound == true && nodePoolTagsFound == true {
+        return instancePoolTagsFound, nodePoolTagsFound, fmt.Errorf("can not use both instancepoolTags and nodepoolTags in node-group-auto-discovery")
+    }
+    if len(nodeGroupAutoDiscoveryList) > 0 && instancePoolTagsFound == false && nodePoolTagsFound == false {
+        return instancePoolTagsFound, nodePoolTagsFound, fmt.Errorf("either instancepoolTags or nodepoolTags should be provided in node-group-auto-discovery")
+    }
+    return instancePoolTagsFound, nodePoolTagsFound, nil
+}
diff --git a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go
index dfcc1728f7f2..425f4350331c 100644
--- a/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/oci/instancepools/oci_cloud_provider.go
@@ -153,8 +153,14 @@ func BuildOCI(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
     if err != nil {
         klog.Fatalf("Failed to get pool type: %v", err)
     }
-    if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) {
-        manager, err := nodepools.CreateNodePoolManager(opts.CloudConfig, do, createKubeClient(opts))
+    _, nodepoolTagsFound, err := ocicommon.HasNodeGroupTags(opts.NodeGroupAutoDiscovery)
+    if err != nil {
+        klog.Fatalf("Failed to get auto discovery tags: %v", err)
+    }
+    if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) && nodepoolTagsFound == true {
+        klog.Fatalf("-nodes and -node-group-auto-discovery parameters can not be used together.")
+    } else if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) || nodepoolTagsFound == true {
+        manager, err := nodepools.CreateNodePoolManager(opts.CloudConfig, opts.NodeGroupAutoDiscovery, do, createKubeClient(opts))
         if err != nil {
             klog.Fatalf("Could not create OCI OKE cloud provider: %v", err)
         }
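For illustration only (not part of the diff): a minimal, standalone sketch of how the new `HasNodeGroupTags` helper responds to a couple of `--node-group-auto-discovery` values. The import path/alias and the OCIDs below are assumptions based on the package layout above, not code from this change.

```go
package main

import (
	"fmt"

	ocicommon "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/common"
)

func main() {
	// A nodepool-style discovery spec: nodepoolTags found, no error.
	specs := []string{"clusterId:ocid1.cluster.oc1..example,compartmentId:ocid1.compartment.oc1..example,nodepoolTags:ca-managed=true,min:1,max:5"}
	fmt.Println(ocicommon.HasNodeGroupTags(specs)) // false true <nil>

	// Mixing instancepoolTags and nodepoolTags across specs is rejected.
	mixed := append(specs, "compartmentId:ocid1.compartment.oc1..example,instancepoolTags:ca-managed=true,min:1,max:5")
	if _, _, err := ocicommon.HasNodeGroupTags(mixed); err != nil {
		fmt.Println("rejected:", err)
	}
}
```

`BuildOCI` relies on this result to route the discovery specs to the nodepool manager and, since it fatals when both `-nodes` and nodepool tags are present, to keep auto discovery and static discovery mutually exclusive.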
diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
index 6f04b563c3ba..8fdd35b38bf9 100644
--- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
+++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
@@ -9,6 +9,7 @@ import (
     "fmt"
     "math"
     "os"
+    "regexp"
     "strconv"
     "strings"
     "time"
@@ -34,6 +35,11 @@ import (
 const (
     maxAddTaintRetries    = 5
     maxGetNodepoolRetries = 3
+    clusterId             = "clusterId"
+    compartmentId         = "compartmentId"
+    nodepoolTags          = "nodepoolTags"
+    min                   = "min"
+    max                   = "max"
 )
 
 var (
@@ -75,10 +81,11 @@ type okeClient interface {
     GetNodePool(context.Context, oke.GetNodePoolRequest) (oke.GetNodePoolResponse, error)
     UpdateNodePool(context.Context, oke.UpdateNodePoolRequest) (oke.UpdateNodePoolResponse, error)
     DeleteNode(context.Context, oke.DeleteNodeRequest) (oke.DeleteNodeResponse, error)
+    ListNodePools(ctx context.Context, request oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error)
 }
 
 // CreateNodePoolManager creates an NodePoolManager that can manage autoscaling node pools
-func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (NodePoolManager, error) {
+func CreateNodePoolManager(cloudConfigPath string, nodeGroupAutoDiscoveryList []string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (NodePoolManager, error) {
     var err error
     var configProvider common.ConfigurationProvider
 
@@ -151,6 +158,20 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
         nodePoolCache: newNodePoolCache(&okeClient),
     }
 
+    // auto discover nodepools from compartments with nodeGroupAutoDiscovery parameter
+    klog.Infof("checking node groups for autodiscovery ... ")
+    for _, arg := range nodeGroupAutoDiscoveryList {
+        nodeGroup, err := nodeGroupFromArg(arg)
+        if err != nil {
+            return nil, fmt.Errorf("unable to construct node group auto discovery from argument: %v", err)
+        }
+        nodeGroup.manager = manager
+        nodeGroup.kubeClient = kubeClient
+
+        manager.nodeGroups = append(manager.nodeGroups, *nodeGroup)
+        autoDiscoverNodeGroups(manager, manager.okeClient, *nodeGroup)
+    }
+
     // Contains all the specs from the args that give us the pools.
     for _, arg := range discoveryOpts.NodeGroupSpecs {
         np, err := nodePoolFromArg(arg)
@@ -180,6 +201,48 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
     return manager, nil
 }
 
+func autoDiscoverNodeGroups(m *ociManagerImpl, okeClient okeClient, nodeGroup nodeGroupAutoDiscovery) (bool, error) {
+    var resp, reqErr = okeClient.ListNodePools(context.Background(), oke.ListNodePoolsRequest{
+        ClusterId:     common.String(nodeGroup.clusterId),
+        CompartmentId: common.String(nodeGroup.compartmentId),
+    })
+    if reqErr != nil {
+        klog.Errorf("failed to fetch the nodepool list with clusterId: %s, compartmentId: %s. Error: %v", nodeGroup.clusterId, nodeGroup.compartmentId, reqErr)
+        return false, reqErr
+    }
+    for _, nodePoolSummary := range resp.Items {
+        if validateNodepoolTags(nodeGroup.tags, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags) {
+            nodepool := &nodePool{}
+            nodepool.id = *nodePoolSummary.Id
+            nodepool.minSize = nodeGroup.minSize
+            nodepool.maxSize = nodeGroup.maxSize
+
+            nodepool.manager = nodeGroup.manager
+            nodepool.kubeClient = nodeGroup.kubeClient
+
+            m.staticNodePools[nodepool.id] = nodepool
+            klog.V(5).Infof("auto discovered nodepool in compartment : %s , nodepoolid: %s", nodeGroup.compartmentId, nodepool.id)
+        } else {
+            klog.Warningf("nodepool ignored as the tags do not satisfy the requirement : %s , %v, %v", *nodePoolSummary.Id, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags)
+        }
+    }
+    return true, nil
+}
+
+func validateNodepoolTags(nodeGroupTags map[string]string, freeFormTags map[string]string, definedTags map[string]map[string]interface{}) bool {
+    if nodeGroupTags != nil {
+        for tagKey, tagValue := range nodeGroupTags {
+            namespacedTagKey := strings.Split(tagKey, ".")
+            if len(namespacedTagKey) == 2 && tagValue != definedTags[namespacedTagKey[0]][namespacedTagKey[1]] {
+                return false
+            } else if len(namespacedTagKey) != 2 && tagValue != freeFormTags[tagKey] {
+                return false
+            }
+        }
+    }
+    return true
+}
+
 // nodePoolFromArg parses a node group spec represented in the form of `<min>:<max>:<ocid>` and produces a node group spec object
 func nodePoolFromArg(value string) (*nodePool, error) {
     tokens := strings.SplitN(value, ":", 3)
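For illustration only (not part of the diff): a sketch of the matching rules that `validateNodepoolTags` implements, written as a throwaway helper inside the `nodepools` package. Free-form tags are looked up by plain key, while defined tags use a namespaced `namespace.key` key; the helper name and tag values are hypothetical.

```go
// tagMatchingSketch is a sketch-only, hypothetical helper exercising validateNodepoolTags.
func tagMatchingSketch() (bool, bool, bool) {
	freeform := map[string]string{"ca-managed": "true"}
	defined := map[string]map[string]interface{}{"ns": {"team": "dev"}}

	matchFreeform := validateNodepoolTags(map[string]string{"ca-managed": "true"}, freeform, defined) // true: free-form key matches
	matchDefined := validateNodepoolTags(map[string]string{"ns.team": "dev"}, freeform, defined)      // true: namespaced key matches the defined tag
	mismatch := validateNodepoolTags(map[string]string{"ns.team": "prod"}, freeform, defined)         // false: defined tag value differs

	return matchFreeform, matchDefined, mismatch
}
```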
fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", clusterId) + } + + if parametersMap[compartmentId] != "" { + spec.compartmentId = parametersMap[compartmentId] + } else { + return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", compartmentId) + } + + if size, err := strconv.Atoi(parametersMap[min]); err == nil { + spec.minSize = size + } else { + return nil, fmt.Errorf("failed to set %s size: %s, expected integer", min, parametersMap[min]) + } + + if size, err := strconv.Atoi(parametersMap[max]); err == nil { + spec.maxSize = size + } else { + return nil, fmt.Errorf("failed to set %s size: %s, expected integer", max, parametersMap[max]) + } + + if parametersMap[nodepoolTags] != "" { + nodepoolTags := parametersMap[nodepoolTags] + + spec.tags = make(map[string]string) + + pairs := strings.Split(nodepoolTags, "&") + + for _, pair := range pairs { + parts := strings.Split(pair, "=") + if len(parts) == 2 { + spec.tags[parts[0]] = parts[1] + } else { + return nil, fmt.Errorf("nodepoolTags should be given in tagKey=tagValue format, this is not valid: %s", pair) + } + } + } else { + return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", nodepoolTags) + } + + klog.Infof("node group auto discovery spec constructed: %+v", spec) + + return spec, nil +} + type ociManagerImpl struct { cfg *ocicommon.CloudConfig okeClient okeClient @@ -215,6 +353,7 @@ type ociManagerImpl struct { ociTagsGetter ocicommon.TagsGetter registeredTaintsGetter RegisteredTaintsGetter staticNodePools map[string]NodePool + nodeGroups []nodeGroupAutoDiscovery lastRefresh time.Time @@ -253,6 +392,15 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1 } func (m *ociManagerImpl) forceRefresh() error { + // auto discover node groups + if m.nodeGroups != nil { + // empty previous nodepool map to do an auto discovery + m.staticNodePools = make(map[string]NodePool) + for _, nodeGroup := range m.nodeGroups { + autoDiscoverNodeGroups(m, m.okeClient, nodeGroup) + } + } + // rebuild nodepool cache err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries) if err != nil { return err diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go index 838d0ce199d1..80b7b1a64b66 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go @@ -336,6 +336,10 @@ func (c mockOKEClient) DeleteNode(context.Context, oke.DeleteNodeRequest) (oke.D }, nil } +func (c mockOKEClient) ListNodePools(context.Context, oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error) { + return oke.ListNodePoolsResponse{}, nil +} + func TestRemoveInstance(t *testing.T) { instanceId1 := "instance1" instanceId2 := "instance2" @@ -384,3 +388,96 @@ func TestRemoveInstance(t *testing.T) { } } } + +func TestNodeGroupFromArg(t *testing.T) { + var nodeGroupArg = "clusterId:ocid1.cluster.oc1.test-region.test,compartmentId:ocid1.compartment.oc1.test-region.test,nodepoolTags:ca-managed=true&namespace.foo=bar,min:1,max:5" + nodeGroupAutoDiscovery, err := nodeGroupFromArg(nodeGroupArg) + if err != nil { + t.Errorf("Error: #{err}") + } + if nodeGroupAutoDiscovery.clusterId != "ocid1.cluster.oc1.test-region.test" { + t.Errorf("Error: clusterId should be ocid1.cluster.oc1.test-region.test") + } + if nodeGroupAutoDiscovery.compartmentId != 
"ocid1.compartment.oc1.test-region.test" { + t.Errorf("Error: compartmentId should be ocid1.compartment.oc1.test-region.test") + } + if nodeGroupAutoDiscovery.minSize != 1 { + t.Errorf("Error: minSize should be 1") + } + if nodeGroupAutoDiscovery.maxSize != 5 { + t.Errorf("Error: maxSize should be 5") + } + if nodeGroupAutoDiscovery.tags["ca-managed"] != "true" { + t.Errorf("Error: ca-managed:true is missing in tags.") + } + if nodeGroupAutoDiscovery.tags["namespace.foo"] != "bar" { + t.Errorf("Error: namespace.foo:bar is missing in tags.") + } +} + +func TestValidateNodePoolTags(t *testing.T) { + + testCases := map[string]struct { + nodeGroupTags map[string]string + freeFormTags map[string]string + definedTags map[string]map[string]interface{} + expectedResult bool + }{ + "no-tags": { + nodeGroupTags: nil, + freeFormTags: nil, + definedTags: nil, + expectedResult: true, + }, + "node-group tags provided but no tags on nodepool": { + nodeGroupTags: map[string]string{ + "testTag": "testTagValue", + }, + freeFormTags: nil, + definedTags: nil, + expectedResult: false, + }, + "node-group tags and free-form tags do not match": { + nodeGroupTags: map[string]string{ + "testTag": "testTagValue", + }, + freeFormTags: map[string]string{ + "foo": "bar", + }, + definedTags: nil, + expectedResult: false, + }, + "free-form tags have required node-group tags": { + nodeGroupTags: map[string]string{ + "testTag": "testTagValue", + }, + freeFormTags: map[string]string{ + "foo": "bar", + "testTag": "testTagValue", + }, + definedTags: nil, + expectedResult: true, + }, + "defined tags have required node-group tags": { + nodeGroupTags: map[string]string{ + "ns.testTag": "testTagValue", + }, + freeFormTags: nil, + definedTags: map[string]map[string]interface{}{ + "ns": { + "testTag": "testTagValue", + }, + }, + expectedResult: true, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + result := validateNodepoolTags(tc.nodeGroupTags, tc.freeFormTags, tc.definedTags) + if result != tc.expectedResult { + t.Errorf("Testcase '%s' failed: got %t ; expected %t", name, result, tc.expectedResult) + } + }) + } +} diff --git a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go index 74780d0f2633..43d656e0f76c 100644 --- a/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go +++ b/cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go @@ -46,6 +46,17 @@ type nodePool struct { maxSize int } +type nodeGroupAutoDiscovery struct { + manager NodePoolManager + kubeClient kubernetes.Interface + + clusterId string + compartmentId string + tags map[string]string + minSize int + maxSize int +} + // MaxSize returns maximum size of the node group. func (np *nodePool) MaxSize() int { return np.maxSize