Commit
docs: Fix make docgen for metrics generation (aws#6428)
jonathan-innis authored and LPetro committed Jul 4, 2024
1 parent db6c47d commit a3d8ac4
Showing 7 changed files with 168 additions and 45 deletions.
2 changes: 1 addition & 1 deletion charts/karpenter/README.md
@@ -106,7 +106,7 @@ cosign verify public.ecr.aws/karpenter/karpenter:0.37.0 \
 | strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. |
 | terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. |
 | tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. |
-| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. |
+| topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"DoNotSchedule"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. |
 | webhook.enabled | bool | `false` | Whether to enable the webhooks and webhook permissions. |
 | webhook.metrics.port | int | `8001` | The container port to use for webhook metrics. |
 | webhook.port | int | `8443` | The container port to use for the webhook. |
5 changes: 4 additions & 1 deletion hack/docgen.sh
@@ -8,9 +8,12 @@ compatibilitymatrix() {
   go run hack/docs/compatibilitymetrix_gen_docs.go website/content/en/preview/upgrading/compatibility.md hack/docs/compatibility-karpenter.yaml $versionCount
 }
 
+CONTROLLER_RUNTIME_DIR=$(go list -m -f '{{ .Dir }}' sigs.k8s.io/controller-runtime)
+AWS_SDK_GO_PROMETHEUS_DIR=$(go list -m -f '{{ .Dir }}' github.com/jonathan-innis/aws-sdk-go-prometheus)
+OPERATORPKG_DIR=$(go list -m -f '{{ .Dir }}' github.com/awslabs/operatorpkg)
 
 compatibilitymatrix
-go run hack/docs/metrics_gen_docs.go pkg/ ${KARPENTER_CORE_DIR}/pkg website/content/en/preview/reference/metrics.md
+go run hack/docs/metrics_gen_docs.go pkg/ "${KARPENTER_CORE_DIR}/pkg" "${CONTROLLER_RUNTIME_DIR}/pkg" "${AWS_SDK_GO_PROMETHEUS_DIR}" "${OPERATORPKG_DIR}/metrics" website/content/en/preview/reference/metrics.md
 go run hack/docs/instancetypes_gen_docs.go website/content/en/preview/reference/instance-types.md
 go run hack/docs/configuration_gen_docs.go website/content/en/preview/reference/settings.md
 cd charts/karpenter && helm-docs
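The three new variables rely on the `go list -m -f '{{ .Dir }}'` idiom, which prints the on-disk directory of a module in the current build list, so the metrics generator can parse those dependencies' sources for metric definitions. A minimal Go sketch of the same lookup, assuming only that the listed modules appear in go.mod (resolved paths will vary by machine):

```go
// Sketch: resolve a dependency's module directory the same way the
// script's $(go list -m -f '{{ .Dir }}' ...) substitutions do.
package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func moduleDir(modulePath string) (string, error) {
	out, err := exec.Command("go", "list", "-m", "-f", "{{ .Dir }}", modulePath).Output()
	if err != nil {
		return "", fmt.Errorf("resolving %s: %w", modulePath, err)
	}
	return strings.TrimSpace(string(out)), nil
}

func main() {
	for _, mod := range []string{
		"sigs.k8s.io/controller-runtime",
		"github.com/jonathan-innis/aws-sdk-go-prometheus",
		"github.com/awslabs/operatorpkg",
	} {
		dir, err := moduleDir(mod)
		if err != nil {
			fmt.Println(err)
			continue
		}
		fmt.Printf("%s => %s\n", mod, dir) // typically a path under $GOPATH/pkg/mod
	}
}
```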
1 change: 1 addition & 0 deletions hack/docs/instancetypes_gen_docs.go
@@ -172,6 +172,7 @@ below are the resources available with some assumptions and after the instance o
 
 	// we don't want to show a few labels that will vary amongst regions
 	delete(labelNameMap, v1.LabelTopologyZone)
+	delete(labelNameMap, v1beta1.LabelTopologyZoneID)
 	delete(labelNameMap, corev1beta1.CapacityTypeLabelKey)
 
 	labelNames := lo.Keys(labelNameMap)
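The new `delete(labelNameMap, v1beta1.LabelTopologyZoneID)` scrubs `topology.k8s.aws/zone-id` from the generated tables for the same reason `topology.kubernetes.io/zone` is scrubbed: the values vary by region (and, for zone IDs, by account), so they would churn on every regeneration. A toy sketch of the scrub with hypothetical sample keys, using literal label values in place of the real constants:

```go
// Minimal sketch of the label scrubbing above: region- or account-specific
// label keys are removed from the map before the doc table is rendered.
package main

import (
	"fmt"

	"github.com/samber/lo"
)

func main() {
	labelNameMap := map[string]struct{}{
		"kubernetes.io/arch":          {},
		"topology.kubernetes.io/zone": {}, // varies by region
		"topology.k8s.aws/zone-id":    {}, // varies by region/account (the new deletion)
		"karpenter.sh/capacity-type":  {},
	}
	for _, k := range []string{
		"topology.kubernetes.io/zone",
		"topology.k8s.aws/zone-id",
		"karpenter.sh/capacity-type",
	} {
		delete(labelNameMap, k)
	}
	fmt.Println(lo.Keys(labelNameMap)) // [kubernetes.io/arch]
}
```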
85 changes: 69 additions & 16 deletions hack/docs/metrics_gen_docs.go
@@ -56,12 +56,22 @@ func main() {
 		packages := getPackages(flag.Arg(i))
 		allMetrics = append(allMetrics, getMetricsFromPackages(packages...)...)
 	}
-	// Controller Runtime naming is different in that they don't specify a namespace or subsystem
+
+	// Drop some metrics
+	for _, subsystem := range []string{"rest_client", "certwatcher_read", "controller_runtime_webhook"} {
+		allMetrics = lo.Reject(allMetrics, func(m metricInfo, _ int) bool {
+			return strings.HasPrefix(m.name, subsystem)
+		})
+	}
+
+	// Controller Runtime and AWS SDK Go for Prometheus naming is different in that they don't specify a namespace or subsystem
 	// Getting the metrics requires special parsing logic
-	for i := range allMetrics {
-		if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, "controller_runtime_") {
-			allMetrics[i].subsystem = "controller_runtime"
-			allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, "controller_runtime_")
+	for _, subsystem := range []string{"controller_runtime", "aws_sdk_go", "client_go", "leader_election"} {
+		for i := range allMetrics {
+			if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem)) {
+				allMetrics[i].subsystem = subsystem
+				allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem))
+			}
 		}
 	}
 	sort.Slice(allMetrics, bySubsystem(allMetrics))
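A self-contained sketch of the two new passes above, run against hand-written sample data. `metricInfo` here is a stand-in with only the fields the passes touch, not the real struct from metrics_gen_docs.go:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/samber/lo"
)

type metricInfo struct {
	namespace, subsystem, name string
}

func main() {
	allMetrics := []metricInfo{
		{name: "rest_client_requests_total"},          // dropped by pass 1
		{name: "controller_runtime_reconcile_total"},  // re-bucketed by pass 2
		{name: "aws_sdk_go_request_duration_seconds"}, // re-bucketed by pass 2
		{namespace: "karpenter", subsystem: "pods", name: "startup_duration_seconds"},
	}

	// Pass 1: drop metrics we don't want to document.
	for _, subsystem := range []string{"rest_client", "certwatcher_read", "controller_runtime_webhook"} {
		allMetrics = lo.Reject(allMetrics, func(m metricInfo, _ int) bool {
			return strings.HasPrefix(m.name, subsystem)
		})
	}

	// Pass 2: metrics registered without a subsystem get one carved out of
	// their name prefix so they land in the right doc section.
	for _, subsystem := range []string{"controller_runtime", "aws_sdk_go", "client_go", "leader_election"} {
		for i := range allMetrics {
			if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem)) {
				allMetrics[i].subsystem = subsystem
				allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, fmt.Sprintf("%s_", subsystem))
			}
		}
	}
	fmt.Println(allMetrics)
	// [{ controller_runtime reconcile_total} { aws_sdk_go request_duration_seconds} {karpenter pods startup_duration_seconds}]
}
```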
@@ -91,7 +101,11 @@ description: >
 		if metric.subsystem != previousSubsystem {
 			if metric.subsystem != "" {
 				subsystemTitle := strings.Join(lo.Map(strings.Split(metric.subsystem, "_"), func(s string, _ int) string {
-					return fmt.Sprintf("%s%s", strings.ToTitle(s[0:1]), s[1:])
+					if s == "sdk" || s == "aws" {
+						return strings.ToUpper(s)
+					} else {
+						return fmt.Sprintf("%s%s", strings.ToUpper(s[0:1]), s[1:])
+					}
 				}), " ")
 				fmt.Fprintf(f, "## %s Metrics\n", subsystemTitle)
 				fmt.Fprintln(f)
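The heading logic can be exercised in isolation: with the special cases, `aws_sdk_go` renders as "AWS SDK Go" (matching the new section in metrics.md below) rather than "Aws Sdk Go". A sketch with the closure lifted into a named function:

```go
// Sketch of the heading logic above: "sdk" and "aws" segments are fully
// upper-cased, every other segment gets its first letter upper-cased.
package main

import (
	"fmt"
	"strings"

	"github.com/samber/lo"
)

func subsystemTitle(subsystem string) string {
	return strings.Join(lo.Map(strings.Split(subsystem, "_"), func(s string, _ int) string {
		if s == "sdk" || s == "aws" {
			return strings.ToUpper(s)
		}
		return fmt.Sprintf("%s%s", strings.ToUpper(s[0:1]), s[1:])
	}), " ")
}

func main() {
	fmt.Println(subsystemTitle("aws_sdk_go"))         // AWS SDK Go
	fmt.Println(subsystemTitle("controller_runtime")) // Controller Runtime
	fmt.Println(subsystemTitle("leader_election"))    // Leader Election
}
```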
@@ -162,11 +176,15 @@ func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
 	// Higher ordering comes first. If a value isn't designated here then the subsystem will be given a default of 0.
 	// Metrics without a subsystem come first since there is no designation for the bucket they fall under
 	subSystemSortOrder := map[string]int{
-		"":          100,
-		"nodepool":  10,
-		"nodeclaim": 9,
-		"nodes":     8,
-		"pods":      7,
+		"":                100,
+		"nodepool":        10,
+		"nodeclaims":      9,
+		"nodes":           8,
+		"pods":            7,
+		"workqueue":       -1,
+		"client_go":       -1,
+		"aws_sdk_go":      -1,
+		"leader_election": -2,
 	}
 
 	return func(i, j int) bool {
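The comparator body is collapsed in this diff, so the following is only a plausible reconstruction of how the ordering map is consumed: higher values sort earlier, unlisted subsystems default to 0, and the negative entries push the noisy upstream subsystems to the bottom, with `leader_election` last:

```go
// Plausible sketch only -- the real comparator is not shown in the diff.
package main

import (
	"fmt"
	"sort"
)

type metricInfo struct{ subsystem, name string }

func bySubsystem(metrics []metricInfo) func(i, j int) bool {
	subSystemSortOrder := map[string]int{
		"":                100,
		"nodepool":        10,
		"nodeclaims":      9,
		"nodes":           8,
		"pods":            7,
		"workqueue":       -1,
		"client_go":       -1,
		"aws_sdk_go":      -1,
		"leader_election": -2,
	}
	return func(i, j int) bool {
		// Unlisted subsystems get the zero value, landing between "pods" and "workqueue".
		if subSystemSortOrder[metrics[i].subsystem] != subSystemSortOrder[metrics[j].subsystem] {
			return subSystemSortOrder[metrics[i].subsystem] > subSystemSortOrder[metrics[j].subsystem]
		}
		return metrics[i].name < metrics[j].name
	}
}

func main() {
	ms := []metricInfo{
		{"leader_election", "master_status"},
		{"nodepool", "limit"},
		{"workqueue", "depth"},
		{"", "build_info"},
	}
	sort.Slice(ms, bySubsystem(ms))
	fmt.Println(ms) // [{ build_info} {nodepool limit} {workqueue depth} {leader_election master_status}]
}
```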
@@ -226,7 +244,8 @@ func handleVariableDeclaration(v *ast.GenDecl) []metricInfo {
 			} else {
 				value = v
 			}
-
+		case *ast.BinaryExpr:
+			value = getBinaryExpr(val)
 		default:
 			log.Fatalf("unsupported value %T %v", kv.Value, kv.Value)
 		}
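The new `case *ast.BinaryExpr` handles metric names built by constant string concatenation (common in the upstream packages now being parsed) by delegating to the `getBinaryExpr` helper added in the next hunk. A quick self-contained check of that helper, copied verbatim from the diff, against a synthetic expression:

```go
package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"log"
	"strings"
)

// getBinaryExpr recursively flattens a tree of string-literal concatenations
// into a single string (copied from the diff above).
func getBinaryExpr(b *ast.BinaryExpr) string {
	var x, y string
	switch val := b.X.(type) {
	case *ast.BasicLit:
		x = strings.Trim(val.Value, `"`)
	case *ast.BinaryExpr:
		x = getBinaryExpr(val)
	default:
		log.Fatalf("unsupported value %T %v", val, val)
	}
	switch val := b.Y.(type) {
	case *ast.BasicLit:
		y = strings.Trim(val.Value, `"`)
	case *ast.BinaryExpr:
		y = getBinaryExpr(val)
	default:
		log.Fatalf("unsupported value %T %v", val, val)
	}
	return x + y
}

func main() {
	expr, err := parser.ParseExpr(`"aws_sdk_go" + "_" + "request_total"`)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(getBinaryExpr(expr.(*ast.BinaryExpr))) // aws_sdk_go_request_total
}
```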
@@ -261,20 +280,54 @@ func getFuncPackage(fun ast.Expr) string {
 	if iexpr, ok := fun.(*ast.IndexExpr); ok {
 		return getFuncPackage(iexpr.X)
 	}
+	if _, ok := fun.(*ast.FuncLit); ok {
+		return ""
+	}
 	log.Fatalf("unsupported func expression %T, %v", fun, fun)
 	return ""
 }
 
+func getBinaryExpr(b *ast.BinaryExpr) string {
+	var x, y string
+	switch val := b.X.(type) {
+	case *ast.BasicLit:
+		x = strings.Trim(val.Value, `"`)
+	case *ast.BinaryExpr:
+		x = getBinaryExpr(val)
+	default:
+		log.Fatalf("unsupported value %T %v", val, val)
+	}
+	switch val := b.Y.(type) {
+	case *ast.BasicLit:
+		y = strings.Trim(val.Value, `"`)
+	case *ast.BinaryExpr:
+		y = getBinaryExpr(val)
+	default:
+		log.Fatalf("unsupported value %T %v", val, val)
+	}
+	return x + y
+}
+
 // we cannot get the value of an Identifier directly so we map it manually instead
 func getIdentMapping(identName string) (string, error) {
 	identMapping := map[string]string{
 		"metrics.Namespace": metrics.Namespace,
 		"Namespace":         metrics.Namespace,
 
-		"NodeSubsystem":         "nodes",
-		"metrics.NodeSubsystem": "nodes",
-		"machineSubsystem":      "machines",
-		"nodeClaimSubsystem":    "nodeclaims",
+		"WorkQueueSubsystem":         "workqueue",
+		"DepthKey":                   "depth",
+		"AddsKey":                    "adds_total",
+		"QueueLatencyKey":            "queue_duration_seconds",
+		"WorkDurationKey":            "work_duration_seconds",
+		"UnfinishedWorkKey":          "unfinished_work_seconds",
+		"LongestRunningProcessorKey": "longest_running_processor_seconds",
+		"RetriesKey":                 "retries_total",
+
+		"NodeSubsystem":              "nodes",
+		"metrics.NodeSubsystem":      "nodes",
+		"machineSubsystem":           "machines",
+		"NodeClaimSubsystem":         "nodeclaims",
+		"metrics.NodeClaimSubsystem": "nodeclaims",
 		// TODO @joinnis: We should eventually change this subsystem to be
 		// plural so that it aligns with the other subsystems
 		"nodePoolSubsystem": "nodepool",
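For context on `getIdentMapping`: the generator parses source files without type-checking them, so an identifier like `WorkQueueSubsystem` arrives as a bare `*ast.Ident` whose value cannot be resolved, and the table above hard-codes known names to their string values instead. A trimmed-down sketch (only two mappings kept; the error text is an assumption):

```go
package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"log"
)

// Trimmed-down stand-in for the full mapping table in metrics_gen_docs.go.
func getIdentMapping(identName string) (string, error) {
	identMapping := map[string]string{
		"WorkQueueSubsystem": "workqueue",
		"DepthKey":           "depth",
	}
	if v, ok := identMapping[identName]; ok {
		return v, nil
	}
	return "", fmt.Errorf("no mapping for %s", identName) // assumed error text
}

func main() {
	expr, err := parser.ParseExpr("WorkQueueSubsystem")
	if err != nil {
		log.Fatal(err)
	}
	ident := expr.(*ast.Ident)
	fmt.Println(ident.Name)                   // WorkQueueSubsystem -- the name, not the value
	fmt.Println(getIdentMapping(ident.Name))  // workqueue <nil>
}
```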
7 changes: 4 additions & 3 deletions website/content/en/preview/reference/instance-types.md
@@ -6537,7 +6537,6 @@ below are the resources available with some assumptions and after the instance o
 |kubernetes.io/arch|arm64|
 |kubernetes.io/os|linux|
 |node.kubernetes.io/instance-type|hpc7g.4xlarge|
-|topology.k8s.aws/zone-id|6419929671613507071|
 #### Resources
 | Resource | Quantity |
 |--|--|
@@ -6564,7 +6563,6 @@ below are the resources available with some assumptions and after the instance o
 |kubernetes.io/arch|arm64|
 |kubernetes.io/os|linux|
 |node.kubernetes.io/instance-type|hpc7g.8xlarge|
-|topology.k8s.aws/zone-id|3124717047704565898|
 #### Resources
 | Resource | Quantity |
 |--|--|
@@ -6591,7 +6589,6 @@ below are the resources available with some assumptions and after the instance o
 |kubernetes.io/arch|arm64|
 |kubernetes.io/os|linux|
 |node.kubernetes.io/instance-type|hpc7g.16xlarge|
-|topology.k8s.aws/zone-id|4594531912622968525|
 #### Resources
 | Resource | Quantity |
 |--|--|
@@ -19508,6 +19505,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|12582912|
+|karpenter.k8s.aws/instance-network-bandwidth|100000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
@@ -19534,6 +19532,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|16777216|
+|karpenter.k8s.aws/instance-network-bandwidth|200000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
@@ -19560,6 +19559,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|25165824|
+|karpenter.k8s.aws/instance-network-bandwidth|200000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
@@ -19586,6 +19586,7 @@ below are the resources available with some assumptions and after the instance o
 |karpenter.k8s.aws/instance-generation|7|
 |karpenter.k8s.aws/instance-hypervisor|nitro|
 |karpenter.k8s.aws/instance-memory|33554432|
+|karpenter.k8s.aws/instance-network-bandwidth|200000|
 |karpenter.k8s.aws/instance-size|224xlarge|
 |kubernetes.io/arch|amd64|
 |kubernetes.io/os|linux|
111 changes: 88 additions & 23 deletions website/content/en/preview/reference/metrics.md
@@ -19,6 +19,35 @@ The nodepool usage is the amount of resources that have been provisioned by a pa
 ### `karpenter_nodepool_limit`
 The nodepool limits are the limits specified on the nodepool that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type.
 
+## Nodeclaims Metrics
+
+### `karpenter_nodeclaims_termination_duration_seconds`
+Duration of NodeClaim termination in seconds.
+
+### `karpenter_nodeclaims_terminated`
+Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool.
+
+### `karpenter_nodeclaims_registered`
+Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool.
+
+### `karpenter_nodeclaims_launched`
+Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool.
+
+### `karpenter_nodeclaims_instance_termination_duration_seconds`
+Duration of CloudProvider Instance termination in seconds.
+
+### `karpenter_nodeclaims_initialized`
+Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool.
+
+### `karpenter_nodeclaims_drifted`
+Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool.
+
+### `karpenter_nodeclaims_disrupted`
+Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool.
+
+### `karpenter_nodeclaims_created`
+Number of nodeclaims created in total by Karpenter. Labeled by reason the nodeclaim was created and the owning nodepool.
+
 ## Nodes Metrics
 
 ### `karpenter_nodes_total_pod_requests`
@@ -73,29 +102,6 @@ The number of pods currently waiting to be scheduled.
 ### `karpenter_provisioner_scheduling_duration_seconds`
 Duration of scheduling process in seconds.
 
-## Nodeclaims Metrics
-
-### `karpenter_nodeclaims_terminated`
-Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool.
-
-### `karpenter_nodeclaims_registered`
-Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_launched`
-Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_initialized`
-Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_drifted`
-Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool.
-
-### `karpenter_nodeclaims_disrupted`
-Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool.
-
-### `karpenter_nodeclaims_created`
-Number of nodeclaims created in total by Karpenter. Labeled by reason the nodeclaim was created and the owning nodepool.
-
 ## Interruption Metrics
 
 ### `karpenter_interruption_received_messages`
@@ -185,6 +191,9 @@ Size of the request batch per batcher
 
 ## Controller Runtime Metrics
 
+### `controller_runtime_terminal_reconcile_errors_total`
+Total number of terminal reconciliation errors per controller
+
 ### `controller_runtime_reconcile_total`
 Total number of reconciliations per controller
 
@@ -200,3 +209,59 @@ Maximum number of concurrent reconciles per controller
 ### `controller_runtime_active_workers`
 Number of currently used workers per controller
 
+## Workqueue Metrics
+
+### `workqueue_work_duration_seconds`
+How long in seconds processing an item from workqueue takes.
+
+### `workqueue_unfinished_work_seconds`
+How many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
+
+### `workqueue_retries_total`
+Total number of retries handled by workqueue
+
+### `workqueue_queue_duration_seconds`
+How long in seconds an item stays in workqueue before being requested
+
+### `workqueue_longest_running_processor_seconds`
+How many seconds has the longest running processor for workqueue been running.
+
+### `workqueue_depth`
+Current depth of workqueue
+
+### `workqueue_adds_total`
+Total number of adds handled by workqueue
+
+## Client Go Metrics
+
+### `client_go_request_total`
+Number of HTTP requests, partitioned by status code and method.
+
+### `client_go_request_duration_seconds`
+Request latency in seconds. Broken down by verb, group, version, kind, and subresource.
+
+## AWS SDK Go Metrics
+
+### `aws_sdk_go_request_total`
+The total number of AWS SDK Go requests
+
+### `aws_sdk_go_request_retry_count`
+The total number of AWS SDK Go retry attempts per request
+
+### `aws_sdk_go_request_duration_seconds`
+Latency of AWS SDK Go requests
+
+### `aws_sdk_go_request_attempt_total`
+The total number of AWS SDK Go request attempts
+
+### `aws_sdk_go_request_attempt_duration_seconds`
+Latency of AWS SDK Go request attempts
+
+## Leader Election Metrics
+
+### `leader_election_slowpath_total`
+Total number of slow path exercised in renewing leader leases. 'name' is the string used to identify the lease. Please make sure to group by name.
+
+### `leader_election_master_status`
+Gauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
+
2 changes: 1 addition & 1 deletion website/content/en/preview/reference/settings.md
@@ -21,7 +21,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf
 | CLUSTER_NAME | \-\-cluster-name | [REQUIRED] The kubernetes cluster name for resource discovery.|
 | DISABLE_WEBHOOK | \-\-disable-webhook | Disable the admission and validation webhooks|
 | ENABLE_PROFILING | \-\-enable-profiling | Enable the profiling on the metric endpoint|
-| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift,SpotToSpotConsolidation (default = Drift=true,SpotToSpotConsolidation=false)|
+| FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: SpotToSpotConsolidation (default = SpotToSpotConsolidation=false)|
 | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)|
 | INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.|
 | ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS on-demand pricing endpoint.|
