Commit
docs: Fix make docgen for metrics generation (aws#6428)
jonathan-innis authored and LPetro committed Jul 4, 2024
1 parent db6c47d commit a3d8ac4
Showing 7 changed files with 168 additions and 45 deletions.
2 changes: 1 addition & 1 deletion charts/karpenter/README.md
@@ -106,7 +106,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:0.37.0 \
 | strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. |
 | terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. |
 | tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. |
-| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. |
+| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"DoNotSchedule"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. |
 | webhook.enabled | bool | `false` | Whether to enable the webhooks and webhook permissions. |
 | webhook.metrics.port | int | `8001` | The container port to use for webhook metrics. |
 | webhook.port | int | `8443` | The container port to use for the webhook. |
5 changes: 4 additions & 1 deletion hack/docgen.sh
@@ -8,9 +8,12 @@ compatibilitymatrix() {
   go run hack/docs/compatibilitymetrix_gen_docs.go website/content/en/preview/upgrading/compatibility.md hack/docs/compatibility-karpenter.yaml $versionCount
 }
 
+CONTROLLER_RUNTIME_DIR=$(go list -m -f '{{ .Dir }}' sigs.k8s.io/controller-runtime)
+AWS_SDK_GO_PROMETHEUS_DIR=$(go list -m -f '{{ .Dir }}' github.com/jonathan-innis/aws-sdk-go-prometheus)
+OPERATORPKG_DIR=$(go list -m -f '{{ .Dir }}' github.com/awslabs/operatorpkg)
 
 compatibilitymatrix
-go run hack/docs/metrics_gen_docs.go pkg/ ${KARPENTER_CORE_DIR}/pkg website/content/en/preview/reference/metrics.md
+go run hack/docs/metrics_gen_docs.go pkg/ "${KARPENTER_CORE_DIR}/pkg" "${CONTROLLER_RUNTIME_DIR}/pkg" "${AWS_SDK_GO_PROMETHEUS_DIR}" "${OPERATORPKG_DIR}/metrics" website/content/en/preview/reference/metrics.md
 go run hack/docs/instancetypes_gen_docs.go website/content/en/preview/reference/instance-types.md
 go run hack/docs/configuration_gen_docs.go website/content/en/preview/reference/settings.md
 cd charts/karpenter && helm-docs
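The three new variables rely on the `go list -m -f '{{ .Dir }}'` idiom, which prints the on-disk directory of a module in the current build list, so the metrics generator can parse those dependencies' sources for metric definitions. A minimal Go sketch of the same lookup, assuming only that the listed modules appear in go.mod (resolved paths will vary by machine):

```go
// Sketch: resolve a dependency's module directory the same way the
// script's $(go list -m -f '{{ .Dir }}' ...) substitutions do.
package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func moduleDir(modulePath string) (string, error) {
	out, err := exec.Command("go", "list", "-m", "-f", "{{ .Dir }}", modulePath).Output()
	if err != nil {
		return "", fmt.Errorf("resolving %s: %w", modulePath, err)
	}
	return strings.TrimSpace(string(out)), nil
}

func main() {
	for _, mod := range []string{
		"sigs.k8s.io/controller-runtime",
		"github.com/jonathan-innis/aws-sdk-go-prometheus",
		"github.com/awslabs/operatorpkg",
	} {
		dir, err := moduleDir(mod)
		if err != nil {
			fmt.Println(err)
			continue
		}
		fmt.Printf("%s => %s\n", mod, dir) // typically a path under $GOPATH/pkg/mod
	}
}
```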
1 change: 1 addition & 0 deletions hack/docs/instancetypes_gen_docs.go
@@ -172,6 +172,7 @@ below are the resources available with some assumptions and after the instance o
 
 	// we don't want to show a few labels that will vary amongst regions
 	delete(labelNameMap, v1.LabelTopologyZone)
+	delete(labelNameMap, v1beta1.LabelTopologyZoneID)
 	delete(labelNameMap, corev1beta1.CapacityTypeLabelKey)
 
 	labelNames := lo.Keys(labelNameMap)
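The new `delete(labelNameMap, v1beta1.LabelTopologyZoneID)` scrubs `topology.k8s.aws/zone-id` from the generated tables for the same reason `topology.kubernetes.io/zone` is scrubbed: the values vary by region (and, for zone IDs, by account), so they would churn on every regeneration. A toy sketch of the scrub with hypothetical sample keys, using literal label values in place of the real constants:

```go
// Minimal sketch of the label scrubbing above: region- or account-specific
// label keys are removed from the map before the doc table is rendered.
package main

import (
	"fmt"

	"github.com/samber/lo"
)

func main() {
	labelNameMap := map[string]struct{}{
		"kubernetes.io/arch":          {},
		"topology.kubernetes.io/zone": {}, // varies by region
		"topology.k8s.aws/zone-id":    {}, // varies by region/account (the new deletion)
		"karpenter.sh/capacity-type":  {},
	}
	for _, k := range []string{
		"topology.kubernetes.io/zone",
		"topology.k8s.aws/zone-id",
		"karpenter.sh/capacity-type",
	} {
		delete(labelNameMap, k)
	}
	fmt.Println(lo.Keys(labelNameMap)) // [kubernetes.io/arch]
}
```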
85 changes: 69 additions & 16 deletions hack/docs/metrics_gen_docs.go
@@ -56,12 +56,22 @@ func main() {
 		packages := getPackages(flag.Arg(i))
 		allMetrics = append(allMetrics, getMetricsFromPackages(packages...)...)
 	}
-	// Controller Runtime naming is different in that they don't specify a namespace or subsystem
+
+	// Drop some metrics
+	for _, subsystem := range []string{"rest_client", "certwatcher_read", "controller_runtime_webhook"} {
+		allMetrics = lo.Reject(allMetrics, func(m metricInfo, _ int) bool {
+			return strings.HasPrefix(m.name, subsystem)
+		})
+	}
+
+	// Controller Runtime and AWS SDK Go for Prometheus naming is different in that they don't specify a namespace or subsystem
 	// Getting the metrics requires special parsing logic
-	for i := range allMetrics {
-		if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, "controller_runtime_") {
-			allMetrics[i].subsystem = "controller_runtime"
-			allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, "controller_runtime_")
+	for _, subsystem := range []string{"controller_runtime", "aws_sdk_go", "client_go", "leader_election"} {
+		for i := range allMetrics {
+			if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem)) {
+				allMetrics[i].subsystem = subsystem
+				allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem))
+			}
 		}
 	}
 	sort.Slice(allMetrics, bySubsystem(allMetrics))
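A self-contained sketch of the two new passes above, run against hand-written sample data. `metricInfo` here is a stand-in with only the fields the passes touch, not the real struct from metrics_gen_docs.go:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/samber/lo"
)

type metricInfo struct {
	namespace, subsystem, name string
}

func main() {
	allMetrics := []metricInfo{
		{name: "rest_client_requests_total"},          // dropped by pass 1
		{name: "controller_runtime_reconcile_total"},  // re-bucketed by pass 2
		{name: "aws_sdk_go_request_duration_seconds"}, // re-bucketed by pass 2
		{namespace: "karpenter", subsystem: "pods", name: "startup_duration_seconds"},
	}

	// Pass 1: drop metrics we don't want to document.
	for _, subsystem := range []string{"rest_client", "certwatcher_read", "controller_runtime_webhook"} {
		allMetrics = lo.Reject(allMetrics, func(m metricInfo, _ int) bool {
			return strings.HasPrefix(m.name, subsystem)
		})
	}

	// Pass 2: metrics registered without a subsystem get one carved out of
	// their name prefix so they land in the right doc section.
	for _, subsystem := range []string{"controller_runtime", "aws_sdk_go", "client_go", "leader_election"} {
		for i := range allMetrics {
			if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem)) {
				allMetrics[i].subsystem = subsystem
				allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem))
			}
		}
	}
	fmt.Println(allMetrics)
	// [{ controller_runtime reconcile_total} { aws_sdk_go request_duration_seconds} {karpenter pods startup_duration_seconds}]
}
```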
@@ -91,7 +101,11 @@ description: >
 		if metric.subsystem != previousSubsystem {
 			if metric.subsystem != "" {
 				subsystemTitle := strings.Join(lo.Map(strings.Split(metric.subsystem, "_"), func(s string, _ int) string {
-					return fmt.Sprintf("%s%s", strings.ToTitle(s[0:1]), s[1:])
+					if s == "sdk" || s == "aws" {
+						return strings.ToUpper(s)
+					} else {
+						return fmt.Sprintf("%s%s", strings.ToUpper(s[0:1]), s[1:])
+					}
 				}), " ")
 				fmt.Fprintf(f, "## %s Metrics\n", subsystemTitle)
 				fmt.Fprintln(f)
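The heading logic can be exercised in isolation: with the special cases, `aws_sdk_go` renders as "AWS SDK Go" (matching the new section in metrics.md below) rather than "Aws Sdk Go". A sketch with the closure lifted into a named function:

```go
// Sketch of the heading logic above: "sdk" and "aws" segments are fully
// upper-cased, every other segment gets its first letter upper-cased.
package main

import (
	"fmt"
	"strings"

	"github.com/samber/lo"
)

func subsystemTitle(subsystem string) string {
	return strings.Join(lo.Map(strings.Split(subsystem, "_"), func(s string, _ int) string {
		if s == "sdk" || s == "aws" {
			return strings.ToUpper(s)
		}
		return fmt.Sprintf("%s%s", strings.ToUpper(s[0:1]), s[1:])
	}), " ")
}

func main() {
	fmt.Println(subsystemTitle("aws_sdk_go"))         // AWS SDK Go
	fmt.Println(subsystemTitle("controller_runtime")) // Controller Runtime
	fmt.Println(subsystemTitle("leader_election"))    // Leader Election
}
```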
@@ -162,11 +176,15 @@ func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
 	// Higher ordering comes first. If a value isn't designated here then the subsystem will be given a default of 0.
 	// Metrics without a subsystem come first since there is no designation for the bucket they fall under
 	subSystemSortOrder := map[string]int{
-		"":          100,
-		"nodepool":  10,
-		"nodeclaim": 9,
-		"nodes":     8,
-		"pods":      7,
+		"":                100,
+		"nodepool":        10,
+		"nodeclaims":      9,
+		"nodes":           8,
+		"pods":            7,
+		"workqueue":       -1,
+		"client_go":       -1,
+		"aws_sdk_go":      -1,
+		"leader_election": -2,
 	}
 
 	return func(i, j int) bool {
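The comparator body is collapsed in this diff, so the following is only a plausible reconstruction of how the ordering map is consumed: higher values sort earlier, unlisted subsystems default to 0, and the negative entries push the noisy upstream subsystems to the bottom, with `leader_election` last:

```go
// Plausible sketch only -- the real comparator is not shown in the diff.
package main

import (
	"fmt"
	"sort"
)

type metricInfo struct{ subsystem, name string }

func bySubsystem(metrics []metricInfo) func(i, j int) bool {
	subSystemSortOrder := map[string]int{
		"":                100,
		"nodepool":        10,
		"nodeclaims":      9,
		"nodes":           8,
		"pods":            7,
		"workqueue":       -1,
		"client_go":       -1,
		"aws_sdk_go":      -1,
		"leader_election": -2,
	}
	return func(i, j int) bool {
		// Unlisted subsystems get the zero value, landing between "pods" and "workqueue".
		if subSystemSortOrder[metrics[i].subsystem] != subSystemSortOrder[metrics[j].subsystem] {
			return subSystemSortOrder[metrics[i].subsystem] > subSystemSortOrder[metrics[j].subsystem]
		}
		return metrics[i].name < metrics[j].name
	}
}

func main() {
	ms := []metricInfo{
		{"leader_election", "master_status"},
		{"nodepool", "limit"},
		{"workqueue", "depth"},
		{"", "build_info"},
	}
	sort.Slice(ms, bySubsystem(ms))
	fmt.Println(ms) // [{ build_info} {nodepool limit} {workqueue depth} {leader_election master_status}]
}
```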
@@ -226,7 +244,8 @@ func handleVariableDeclaration(v *ast.GenDecl) []metricInfo {
 			} else {
 				value = v
 			}
-
+		case *ast.BinaryExpr:
+			value = getBinaryExpr(val)
 		default:
 			log.Fatalf("unsupported value %T %v", kv.Value, kv.Value)
 		}
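The new `case *ast.BinaryExpr` handles metric names built by constant string concatenation (common in the upstream packages now being parsed) by delegating to the `getBinaryExpr` helper added in the next hunk. A quick self-contained check of that helper, copied verbatim from the diff, against a synthetic expression:

```go
package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"log"
	"strings"
)

// getBinaryExpr recursively flattens a tree of string-literal concatenations
// into a single string (copied from the diff above).
func getBinaryExpr(b *ast.BinaryExpr) string {
	var x, y string
	switch val := b.X.(type) {
	case *ast.BasicLit:
		x = strings.Trim(val.Value, `"`)
	case *ast.BinaryExpr:
		x = getBinaryExpr(val)
	default:
		log.Fatalf("unsupported value %T %v", val, val)
	}
	switch val := b.Y.(type) {
	case *ast.BasicLit:
		y = strings.Trim(val.Value, `"`)
	case *ast.BinaryExpr:
		y = getBinaryExpr(val)
	default:
		log.Fatalf("unsupported value %T %v", val, val)
	}
	return x + y
}

func main() {
	expr, err := parser.ParseExpr(`"aws_sdk_go" + "_" + "request_total"`)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(getBinaryExpr(expr.(*ast.BinaryExpr))) // aws_sdk_go_request_total
}
```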
@@ -261,20 +280,54 @@ func getFuncPackage(fun ast.Expr) string {
 	if iexpr, ok := fun.(*ast.IndexExpr); ok {
 		return getFuncPackage(iexpr.X)
 	}
+	if _, ok := fun.(*ast.FuncLit); ok {
+		return ""
+	}
 	log.Fatalf("unsupported func expression %T, %v", fun, fun)
 	return ""
 }
 
+func getBinaryExpr(b *ast.BinaryExpr) string {
+	var x, y string
+	switch val := b.X.(type) {
+	case *ast.BasicLit:
+		x = strings.Trim(val.Value, `"`)
+	case *ast.BinaryExpr:
+		x = getBinaryExpr(val)
+	default:
+		log.Fatalf("unsupported value %T %v", val, val)
+	}
+	switch val := b.Y.(type) {
+	case *ast.BasicLit:
+		y = strings.Trim(val.Value, `"`)
+	case *ast.BinaryExpr:
+		y = getBinaryExpr(val)
+	default:
+		log.Fatalf("unsupported value %T %v", val, val)
+	}
+	return x + y
+}
+
 // we cannot get the value of an Identifier directly so we map it manually instead
 func getIdentMapping(identName string) (string, error) {
 	identMapping := map[string]string{
 		"metrics.Namespace": metrics.Namespace,
 		"Namespace":         metrics.Namespace,
 
-		"NodeSubsystem":         "nodes",
-		"metrics.NodeSubsystem": "nodes",
-		"machineSubsystem":      "machines",
-		"nodeClaimSubsystem":    "nodeclaims",
+		"WorkQueueSubsystem":         "workqueue",
+		"DepthKey":                   "depth",
+		"AddsKey":                    "adds_total",
+		"QueueLatencyKey":            "queue_duration_seconds",
+		"WorkDurationKey":            "work_duration_seconds",
+		"UnfinishedWorkKey":          "unfinished_work_seconds",
+		"LongestRunningProcessorKey": "longest_running_processor_seconds",
+		"RetriesKey":                 "retries_total",
+
+		"NodeSubsystem":              "nodes",
+		"metrics.NodeSubsystem":      "nodes",
+		"machineSubsystem":           "machines",
+		"NodeClaimSubsystem":         "nodeclaims",
+		"metrics.NodeClaimSubsystem": "nodeclaims",
 		// TODO @joinnis: We should eventually change this subsystem to be
 		// plural so that it aligns with the other subsystems
 		"nodePoolSubsystem": "nodepool",
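For context on `getIdentMapping`: the generator parses source files without type-checking them, so an identifier like `WorkQueueSubsystem` arrives as a bare `*ast.Ident` whose value cannot be resolved, and the table above hard-codes known names to their string values instead. A trimmed-down sketch (only two mappings kept; the error text is an assumption):

```go
package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"log"
)

// Trimmed-down stand-in for the full mapping table in metrics_gen_docs.go.
func getIdentMapping(identName string) (string, error) {
	identMapping := map[string]string{
		"WorkQueueSubsystem": "workqueue",
		"DepthKey":           "depth",
	}
	if v, ok := identMapping[identName]; ok {
		return v, nil
	}
	return "", fmt.Errorf("no mapping for %s", identName) // assumed error text
}

func main() {
	expr, err := parser.ParseExpr("WorkQueueSubsystem")
	if err != nil {
		log.Fatal(err)
	}
	ident := expr.(*ast.Ident)
	fmt.Println(ident.Name)                   // WorkQueueSubsystem -- the name, not the value
	fmt.Println(getIdentMapping(ident.Name))  // workqueue <nil>
}
```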
7 changes: 4 additions & 3 deletions website/content/en/preview/reference/instance-types.md
@@ -6537,7 +6537,6 @@ below are the resources available with some assumptions and after the instance o
 |kubernetes.io/arch|arm64|
 |kubernetes.io/os|linux|
 |node.kubernetes.io/instance-type|hpc7g.4xlarge|
-|topology.k8s.aws/zone-id|6419929671613507071|
 #### Resources
 | Resource | Quantity |
 |--|--|
@@ -6564,7 +6563,6 @@ below are the resources available with some assumptions and after the instance o
 |kubernetes.io/arch|arm64|
 |kubernetes.io/os|linux|
 |node.kubernetes.io/instance-type|hpc7g.8xlarge|
-|topology.k8s.aws/zone-id|3124717047704565898|
 #### Resources
 | Resource | Quantity |
 |--|--|
@@ -6591,7 +6589,6 @@ below are the resources available with some assumptions and after the instance o
 |kubernetes.io/arch|arm64|
 |kubernetes.io/os|linux|
 |node.kubernetes.io/instance-type|hpc7g.16xlarge|
-|topology.k8s.aws/zone-id|4594531912622968525|
 #### Resources
 | Resource | Quantity |
 |--|--|
@@ -19508,6 +19505,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|12582912|
+|karpenter.k8s.aws/instance-network-bandwidth|100000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
@@ -19534,6 +19532,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|16777216|
+|karpenter.k8s.aws/instance-network-bandwidth|200000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
@@ -19560,6 +19559,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|25165824|
+|karpenter.k8s.aws/instance-network-bandwidth|200000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
@@ -19586,6 +19586,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|33554432|
+|karpenter.k8s.aws/instance-network-bandwidth|200000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
111 changes: 88 additions & 23 deletions website/content/en/preview/reference/metrics.md
@@ -19,6 +19,35 @@ The nodepool usage is the amount of resources that have been provisioned by a pa
 ### `karpenter_nodepool_limit`
 The nodepool limits are the limits specified on the nodepool that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type.
 
+## Nodeclaims Metrics
+
+### `karpenter_nodeclaims_termination_duration_seconds`
+Duration of NodeClaim termination in seconds.
+
+### `karpenter_nodeclaims_terminated`
+Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool.
+
+### `karpenter_nodeclaims_registered`
+Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool.
+
+### `karpenter_nodeclaims_launched`
+Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool.
+
+### `karpenter_nodeclaims_instance_termination_duration_seconds`
+Duration of CloudProvider Instance termination in seconds.
+
+### `karpenter_nodeclaims_initialized`
+Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool.
+
+### `karpenter_nodeclaims_drifted`
+Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool.
+
+### `karpenter_nodeclaims_disrupted`
+Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool.
+
+### `karpenter_nodeclaims_created`
+Number of nodeclaims created in total by Karpenter. Labeled by reason the nodeclaim was created and the owning nodepool.
+
 ## Nodes Metrics
 
 ### `karpenter_nodes_total_pod_requests`
@@ -73,29 +102,6 @@ The number of pods currently waiting to be scheduled.
 ### `karpenter_provisioner_scheduling_duration_seconds`
 Duration of scheduling process in seconds.
 
-## Nodeclaims Metrics
-
-### `karpenter_nodeclaims_terminated`
-Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool.
-
-### `karpenter_nodeclaims_registered`
-Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_launched`
-Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_initialized`
-Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_drifted`
-Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool.
-
-### `karpenter_nodeclaims_disrupted`
-Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool.
-
-### `karpenter_nodeclaims_created`
-Number of nodeclaims created in total by Karpenter. Labeled by reason the nodeclaim was created and the owning nodepool.
-
 ## Interruption Metrics
 
 ### `karpenter_interruption_received_messages`
@@ -185,6 +191,9 @@ Size of the request batch per batcher
 
 ## Controller Runtime Metrics
 
+### `controller_runtime_terminal_reconcile_errors_total`
+Total number of terminal reconciliation errors per controller
+
 ### `controller_runtime_reconcile_total`
 Total number of reconciliations per controller
 
@@ -200,3 +209,59 @@ Maximum number of concurrent reconciles per controller
 ### `controller_runtime_active_workers`
 Number of currently used workers per controller
 
+## Workqueue Metrics
+
+### `workqueue_work_duration_seconds`
+How long in seconds processing an item from workqueue takes.
+
+### `workqueue_unfinished_work_seconds`
+How many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
+
+### `workqueue_retries_total`
+Total number of retries handled by workqueue
+
+### `workqueue_queue_duration_seconds`
+How long in seconds an item stays in workqueue before being requested
+
+### `workqueue_longest_running_processor_seconds`
+How many seconds has the longest running processor for workqueue been running.
+
+### `workqueue_depth`
+Current depth of workqueue
+
+### `workqueue_adds_total`
+Total number of adds handled by workqueue
+
+## Client Go Metrics
+
+### `client_go_request_total`
+Number of HTTP requests, partitioned by status code and method.
+
+### `client_go_request_duration_seconds`
+Request latency in seconds. Broken down by verb, group, version, kind, and subresource.
+
+## AWS SDK Go Metrics
+
+### `aws_sdk_go_request_total`
+The total number of AWS SDK Go requests
+
+### `aws_sdk_go_request_retry_count`
+The total number of AWS SDK Go retry attempts per request
+
+### `aws_sdk_go_request_duration_seconds`
+Latency of AWS SDK Go requests
+
+### `aws_sdk_go_request_attempt_total`
+The total number of AWS SDK Go request attempts
+
+### `aws_sdk_go_request_attempt_duration_seconds`
+Latency of AWS SDK Go request attempts
+
+## Leader Election Metrics
+
+### `leader_election_slowpath_total`
+Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. Please make sure to group by name.
+
+### `leader_election_master_status`
+Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
+
2 changes: 1 addition & 1 deletion website/content/en/preview/reference/settings.md
@@ -21,7 +21,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf
 | CLUSTER_NAME | \-\-cluster-name | [REQUIRED] The kubernetes cluster name for resource discovery.|
 | DISABLE_WEBHOOK | \-\-disable-webhook | Disable the admission and validation webhooks|
 | ENABLE_PROFILING | \-\-enable-profiling | Enable the profiling on the metric endpoint|
-| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift,SpotToSpotConsolidation (default = Drift=true,SpotToSpotConsolidation=false)|
+| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: SpotToSpotConsolidation (default = SpotToSpotConsolidation=false)|
 | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)|
 | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.|
 | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.|
