From f012817c63620aeb31b823824c68f7f38335ea22 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Wed, 29 May 2024 17:36:16 +0000 Subject: [PATCH] [metric] #1 Turn metric names to const (#1234) Some Config Sync metrics had a _total suffix appended in the view file, where the metric name is finalized, which creates confusion. This change puts the naming in one place to reduce confusion. --- pkg/metrics/metrics.go | 59 ++++++++++++++----- pkg/metrics/views.go | 28 ++++----- .../controllers/metrics/metrics.go | 39 +++++++++--- .../controllers/metrics/views.go | 18 +++--- 4 files changed, 98 insertions(+), 46 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 07e9bb2359..3b53606c77 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -16,16 +16,47 @@ package metrics import "go.opencensus.io/stats" +const ( + // APICallDurationName is the name of API duration metric + APICallDurationName = "api_duration_seconds" + // ReconcilerErrorsName is the name of reconciler error count metric + ReconcilerErrorsName = "reconciler_errors" + // PipelineErrorName is the name of pipeline error status metric. 
+ PipelineErrorName = "pipeline_error_observed" + // ReconcileDurationName is the name of reconcile duration metric + ReconcileDurationName = "reconcile_duration_seconds" + // ParserDurationName is the name of parser duration metric + ParserDurationName = "parser_duration_seconds" + // LastSyncName is the name of last sync timestamp metric + LastSyncName = "last_sync_timestamp" + // DeclaredResourcesName is the name of declared resource count metric + DeclaredResourcesName = "declared_resources" + // ApplyOperationsName is the name of apply operations count metric + ApplyOperationsName = "apply_operations_total" + // ApplyDurationName is the name of apply duration metric + ApplyDurationName = "apply_duration_seconds" + // ResourceFightsName is the name of resource fight count metric + ResourceFightsName = "resource_fights_total" + // RemediateDurationName is the name of remediate duration metric + RemediateDurationName = "remediate_duration_seconds" + // LastApplyName is the name of last apply timestamp metric + LastApplyName = "last_apply_timestamp" + // ResourceConflictsName is the name of resource conflict count metric + ResourceConflictsName = "resource_conflicts_total" + // InternalErrorsName is the name of internal error count metric + InternalErrorsName = "internal_errors_total" +) + var ( // APICallDuration metric measures the latency of API server calls. APICallDuration = stats.Float64( - "api_duration_seconds", + APICallDurationName, "The duration of API server calls in seconds", stats.UnitSeconds) // ReconcilerErrors metric measures the number of errors in the reconciler. ReconcilerErrors = stats.Int64( - "reconciler_errors", + ReconcilerErrorsName, "The number of errors in the reconciler", stats.UnitDimensionless) @@ -34,73 +65,73 @@ var ( // controller, or the Prometheus exporter will error. 
b/247516388 // https://github.com/GoogleContainerTools/kpt-resource-group/blob/main/controllers/metrics/metrics.go#L88 PipelineError = stats.Int64( - "pipeline_error_observed", + PipelineErrorName, "A boolean value indicates if error happened at readiness stage when syncing a commit", stats.UnitDimensionless) // ReconcileDuration metric measures the latency of reconcile events. ReconcileDuration = stats.Float64( - "reconcile_duration_seconds", + ReconcileDurationName, "The duration of reconcile events in seconds", stats.UnitSeconds) // ParserDuration metric measures the latency of the parse-apply-watch loop. ParserDuration = stats.Float64( - "parser_duration_seconds", + ParserDurationName, "The duration of the parse-apply-watch loop in seconds", stats.UnitSeconds) // LastSync metric measures the timestamp of the latest Git sync. LastSync = stats.Int64( - "last_sync_timestamp", + LastSyncName, "The timestamp of the most recent sync from Git", stats.UnitDimensionless) // DeclaredResources metric measures the number of declared resources parsed from Git. DeclaredResources = stats.Int64( - "declared_resources", + DeclaredResourcesName, "The number of declared resources parsed from Git", stats.UnitDimensionless) // ApplyOperations metric measures the number of applier apply events. ApplyOperations = stats.Int64( - "apply_operations", + ApplyOperationsName, "The number of operations that have been performed to sync resources to source of truth", stats.UnitDimensionless) // ApplyDuration metric measures the latency of applier apply events. ApplyDuration = stats.Float64( - "apply_duration_seconds", + ApplyDurationName, "The duration of applier events in seconds", stats.UnitSeconds) // ResourceFights metric measures the number of resource fights. 
ResourceFights = stats.Int64( - "resource_fights", + ResourceFightsName, "The number of resources that are being synced too frequently", stats.UnitDimensionless) // RemediateDuration metric measures the latency of remediator reconciliation events. RemediateDuration = stats.Float64( - "remediate_duration_seconds", + RemediateDurationName, "The duration of remediator reconciliation events", stats.UnitSeconds) // LastApply metric measures the timestamp of the most recent applier apply event. LastApply = stats.Int64( - "last_apply_timestamp", + LastApplyName, "The timestamp of the most recent applier event", stats.UnitDimensionless) // ResourceConflicts metric measures the number of resource conflicts. ResourceConflicts = stats.Int64( - "resource_conflicts", + ResourceConflictsName, "The number of resource conflicts resulting from a mismatch between the cached resources and cluster resources", stats.UnitDimensionless) // InternalErrors metric measures the number of unexpected internal errors triggered by defensive checks in Config Sync. InternalErrors = stats.Int64( - "internal_errors", + InternalErrorsName, "The number of internal errors triggered by Config Sync", stats.UnitDimensionless) ) diff --git a/pkg/metrics/views.go b/pkg/metrics/views.go index cf3aeaac92..7b5835e1b7 100644 --- a/pkg/metrics/views.go +++ b/pkg/metrics/views.go @@ -28,7 +28,7 @@ var longDistributionBounds = []float64{1, 5, 10, 30, 60, 300, 600, 1200, 1800, 3 var ( // APICallDurationView aggregates the APICallDuration metric measurements. APICallDurationView = &view.View{ - Name: APICallDuration.Name(), + Name: APICallDurationName, Measure: APICallDuration, Description: "The latency distribution of API server calls", TagKeys: []tag.Key{KeyOperation, KeyStatus}, @@ -37,7 +37,7 @@ var ( // ReconcilerErrorsView aggregates the ReconcilerErrors metric measurements. 
ReconcilerErrorsView = &view.View{ - Name: ReconcilerErrors.Name(), + Name: ReconcilerErrorsName, Measure: ReconcilerErrors, Description: "The current number of errors in the RootSync and RepoSync reconcilers", TagKeys: []tag.Key{KeyComponent, KeyErrorClass}, @@ -49,7 +49,7 @@ var ( // controller, or the Prometheus exporter will error. b/247516388 // https://github.com/GoogleContainerTools/kpt-resource-group/blob/main/controllers/metrics/views.go#L123 PipelineErrorView = &view.View{ - Name: PipelineError.Name(), + Name: PipelineErrorName, Measure: PipelineError, Description: "A boolean value indicates if error happened from different stages when syncing a commit", TagKeys: []tag.Key{KeyName, KeyReconcilerType, KeyComponent}, @@ -58,7 +58,7 @@ var ( // ReconcileDurationView aggregates the ReconcileDuration metric measurements. ReconcileDurationView = &view.View{ - Name: ReconcileDuration.Name(), + Name: ReconcileDurationName, Measure: ReconcileDuration, Description: "The latency distribution of RootSync and RepoSync reconcile events", TagKeys: []tag.Key{KeyStatus}, @@ -67,7 +67,7 @@ var ( // ParserDurationView aggregates the ParserDuration metric measurements. ParserDurationView = &view.View{ - Name: ParserDuration.Name(), + Name: ParserDurationName, Measure: ParserDuration, Description: "The latency distribution of the parse-apply-watch loop", TagKeys: []tag.Key{KeyStatus, KeyTrigger, KeyParserSource}, @@ -76,7 +76,7 @@ var ( // LastSyncTimestampView aggregates the LastSyncTimestamp metric measurements. LastSyncTimestampView = &view.View{ - Name: LastSync.Name(), + Name: LastSyncName, Measure: LastSync, Description: "The timestamp of the most recent sync from Git", TagKeys: []tag.Key{KeyCommit, KeyStatus}, @@ -85,7 +85,7 @@ var ( // DeclaredResourcesView aggregates the DeclaredResources metric measurements. 
DeclaredResourcesView = &view.View{ - Name: DeclaredResources.Name(), + Name: DeclaredResourcesName, Measure: DeclaredResources, Description: "The current number of declared resources parsed from Git", TagKeys: []tag.Key{KeyCommit}, @@ -94,7 +94,7 @@ var ( // ApplyOperationsView aggregates the ApplyOps metric measurements. ApplyOperationsView = &view.View{ - Name: ApplyOperations.Name() + "_total", + Name: ApplyOperationsName, Measure: ApplyOperations, Description: "The total number of operations that have been performed to sync resources to source of truth", TagKeys: []tag.Key{KeyController, KeyOperation, KeyStatus}, @@ -103,7 +103,7 @@ var ( // ApplyDurationView aggregates the ApplyDuration metric measurements. ApplyDurationView = &view.View{ - Name: ApplyDuration.Name(), + Name: ApplyDurationName, Measure: ApplyDuration, Description: "The latency distribution of applier resource sync events", TagKeys: []tag.Key{KeyCommit, KeyStatus}, @@ -112,7 +112,7 @@ var ( // LastApplyTimestampView aggregates the LastApplyTimestamp metric measurements. LastApplyTimestampView = &view.View{ - Name: LastApply.Name(), + Name: LastApplyName, Measure: LastApply, Description: "The timestamp of the most recent applier resource sync event", TagKeys: []tag.Key{KeyCommit, KeyStatus}, @@ -121,7 +121,7 @@ var ( // ResourceFightsView aggregates the ResourceFights metric measurements. ResourceFightsView = &view.View{ - Name: ResourceFights.Name() + "_total", + Name: ResourceFightsName, Measure: ResourceFights, Description: "The total number of resources that are being synced too frequently", Aggregation: view.Count(), @@ -129,7 +129,7 @@ var ( // RemediateDurationView aggregates the RemediateDuration metric measurements. 
RemediateDurationView = &view.View{ - Name: RemediateDuration.Name(), + Name: RemediateDurationName, Measure: RemediateDuration, Description: "The latency distribution of remediator reconciliation events", TagKeys: []tag.Key{KeyStatus}, @@ -138,7 +138,7 @@ var ( // ResourceConflictsView aggregates the ResourceConflicts metric measurements. ResourceConflictsView = &view.View{ - Name: ResourceConflicts.Name() + "_total", + Name: ResourceConflictsName, Measure: ResourceConflicts, Description: "The total number of resource conflicts resulting from a mismatch between the cached resources and cluster resources", TagKeys: []tag.Key{KeyCommit}, @@ -147,7 +147,7 @@ var ( // InternalErrorsView aggregates the InternalErrors metric measurements. InternalErrorsView = &view.View{ - Name: InternalErrors.Name() + "_total", + Name: InternalErrorsName, Measure: InternalErrors, Description: "The total number of internal errors triggered by Config Sync", TagKeys: []tag.Key{KeyInternalErrorSource}, diff --git a/pkg/resourcegroup/controllers/metrics/metrics.go b/pkg/resourcegroup/controllers/metrics/metrics.go index 455348de72..a48f02cc8d 100644 --- a/pkg/resourcegroup/controllers/metrics/metrics.go +++ b/pkg/resourcegroup/controllers/metrics/metrics.go @@ -22,6 +22,27 @@ import ( const namespace = "resourcegroup" +const ( + // RGReconcileDurationName is the name of resource group reconcile duration metric + RGReconcileDurationName = "rg_reconcile_duration_seconds" + // ResourceGroupTotalName is the name of resource group count metric + ResourceGroupTotalName = "resource_group_total" + // ResourceCountName is the name of resource count metric + ResourceCountName = "resource_count" + // ReadyResourceCountName is the name of ready resource count metric + ReadyResourceCountName = "ready_resource_count" + // KCCResourceCountName is the name of KCC resource count metric + KCCResourceCountName = "kcc_resource_count" + // NamespaceCountName is the name of namespace count metric + 
NamespaceCountName = "resource_ns_count" + // ClusterScopedResourceCountName is the name of cluster scoped resource count metric + ClusterScopedResourceCountName = "cluster_scoped_resource_count" + // CRDCountName is the name of CRD count metric + CRDCountName = "crd_count" + // PipelineErrorName is the name of pipeline error status metric (same as in Config Sync) + PipelineErrorName = "pipeline_error_observed" +) + var ( // ReconcileDuration tracks the time duration in seconds of reconciling // a ResourceGroup CR by the ResourceGroup controller. @@ -29,62 +50,62 @@ var ( // reason can be: StartReconciling, FinishReconciling, ComponentFailed, ExceedTimeout. // This metric should be updated in the ResourceGroup controller. ReconcileDuration = stats.Float64( - "rg_reconcile_duration_seconds", + RGReconcileDurationName, "Time duration in seconds of reconciling a ResourceGroup CR by the ResourceGroup controller", stats.UnitSeconds) // ResourceGroupTotal tracks the total number of ResourceGroup CRs in a cluster. // This metric should be updated in the Root controller. ResourceGroupTotal = stats.Int64( - "resource_group_total", + ResourceGroupTotalName, "Total number of ResourceGroup CRs in a cluster", stats.UnitDimensionless) // ResourceCount tracks the number of resources in a ResourceGroup CR. // This metric should be updated in the Root controller. ResourceCount = stats.Int64( - "resource_count", + ResourceCountName, "The number of resources in a ResourceGroup CR", stats.UnitDimensionless) // ReadyResourceCount tracks the number of resources with Current status in a ResourceGroup CR. // This metric should be updated in the ResourceGroup controller. ReadyResourceCount = stats.Int64( - "ready_resource_count", + ReadyResourceCountName, "The number of resources with Current status in a ResourceGroup CR", stats.UnitDimensionless) // KCCResourceCount tracks the number of KCC resources in a ResourceGroup CR. // This metric should be updated in the ResourceGroup controller. 
KCCResourceCount = stats.Int64( - "kcc_resource_count", + KCCResourceCountName, "The number of KCC resources in a ResourceGroup CR", stats.UnitDimensionless) // NamespaceCount tracks the number of resource namespaces in a ResourceGroup CR. // This metric should be updated in the Root controller. NamespaceCount = stats.Int64( - "resource_ns_count", + NamespaceCountName, "The number of resource namespaces in a ResourceGroup CR", stats.UnitDimensionless) // ClusterScopedResourceCount tracks the number of cluster-scoped resources in a ResourceGroup CR. // This metric should be updated in the Root controller. ClusterScopedResourceCount = stats.Int64( - "cluster_scoped_resource_count", + ClusterScopedResourceCountName, "The number of cluster-scoped resources in a ResourceGroup CR", stats.UnitDimensionless) // CRDCount tracks the number of CRDs in a ResourceGroup CR. // This metric should be updated in the Root controller. CRDCount = stats.Int64( - "crd_count", + CRDCountName, "The number of CRDs in a ResourceGroup CR", stats.UnitDimensionless) // PipelineError tracks the error that happened when syncing a commit PipelineError = stats.Int64( - "pipeline_error_observed", + PipelineErrorName, "A boolean value indicates if error happened at readiness stage when syncing a commit", stats.UnitDimensionless) ) diff --git a/pkg/resourcegroup/controllers/metrics/views.go b/pkg/resourcegroup/controllers/metrics/views.go index 7ef4dfc379..39643e13b1 100644 --- a/pkg/resourcegroup/controllers/metrics/views.go +++ b/pkg/resourcegroup/controllers/metrics/views.go @@ -22,7 +22,7 @@ import ( var ( // ReconcileDurationView aggregates the ReconcileDuration metric measurements. 
ReconcileDurationView = &view.View{ - Name: ReconcileDuration.Name(), + Name: RGReconcileDurationName, Measure: ReconcileDuration, Description: "The distribution of time taken to reconcile a ResourceGroup CR", TagKeys: []tag.Key{KeyStallReason}, @@ -31,7 +31,7 @@ var ( // ResourceGroupTotalView aggregates the ResourceGroupTotal metric measurements. ResourceGroupTotalView = &view.View{ - Name: ResourceGroupTotal.Name(), + Name: ResourceGroupTotalName, Measure: ResourceGroupTotal, Description: "The current number of ResourceGroup CRs", Aggregation: view.LastValue(), @@ -39,7 +39,7 @@ var ( // ResourceCountView aggregates the ResourceCount metric measurements. ResourceCountView = &view.View{ - Name: ResourceCount.Name(), + Name: ResourceCountName, Measure: ResourceCount, Description: "The total number of resources tracked by a ResourceGroup", TagKeys: []tag.Key{KeyResourceGroup}, @@ -48,7 +48,7 @@ var ( // ReadyResourceCountView aggregates the resources ready in a ResourceGroup ReadyResourceCountView = &view.View{ - Name: ReadyResourceCount.Name(), + Name: ReadyResourceCountName, Measure: ReadyResourceCount, Description: "The total number of ready resources in a ResourceGroup", TagKeys: []tag.Key{KeyResourceGroup}, @@ -57,7 +57,7 @@ var ( // NamespaceCountView counts number of namespaces in a ResourceGroup NamespaceCountView = &view.View{ - Name: NamespaceCount.Name(), + Name: NamespaceCountName, Measure: NamespaceCount, Description: "The number of namespaces used by resources in a ResourceGroup", TagKeys: []tag.Key{KeyResourceGroup}, @@ -66,7 +66,7 @@ var ( // ClusterScopedResourceCountView counts number of namespaces in a ResourceGroup ClusterScopedResourceCountView = &view.View{ - Name: ClusterScopedResourceCount.Name(), + Name: ClusterScopedResourceCountName, Measure: ClusterScopedResourceCount, Description: "The number of cluster scoped resources in a ResourceGroup", TagKeys: []tag.Key{KeyResourceGroup}, @@ -75,7 +75,7 @@ var ( // CRDCountView counts number of 
namespaces in a ResourceGroup CRDCountView = &view.View{ - Name: CRDCount.Name(), + Name: CRDCountName, Measure: CRDCount, Description: "The number of CRDs in a ResourceGroup", TagKeys: []tag.Key{KeyResourceGroup}, @@ -84,7 +84,7 @@ var ( // KCCResourceCountView aggregates the KCC resources in a ResourceGroup KCCResourceCountView = &view.View{ - Name: KCCResourceCount.Name(), + Name: KCCResourceCountName, Measure: KCCResourceCount, Description: "The total number of KCC resources in a ResourceGroup", TagKeys: []tag.Key{KeyResourceGroup}, @@ -94,7 +94,7 @@ var ( // PipelineErrorView aggregates the PipelineError by components // TODO: add link to same metric in Config Sync under pkg/metrics/views.go PipelineErrorView = &view.View{ - Name: PipelineError.Name(), + Name: PipelineErrorName, Measure: PipelineError, Description: "A boolean value indicates if error happened from different stages when syncing a commit", TagKeys: []tag.Key{KeyName, KeyComponent, KeyType},