Skip to content

Commit

Permalink
[metric] GoogleContainerTools#1 Turn metric names to const (GoogleCon…
Browse files Browse the repository at this point in the history
…tainerTools#1234)

Some Config Sync metrics had the `_total` suffix appended in the views file, where the final metric name is assembled, rather than in the metric definition itself, which was confusing.

This change puts the naming in one place to reduce confusion.
  • Loading branch information
tiffanny29631 authored May 29, 2024
1 parent 173db78 commit f012817
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 46 deletions.
59 changes: 45 additions & 14 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,47 @@ package metrics

import "go.opencensus.io/stats"

// Metric name constants. Each constant holds the complete name of one metric,
// including the "_total" suffix where the metric has one.
const (
	// Duration metrics.

	// APICallDurationName is the name of the API call duration metric.
	APICallDurationName = "api_duration_seconds"
	// ReconcileDurationName is the name of the reconcile duration metric.
	ReconcileDurationName = "reconcile_duration_seconds"
	// ParserDurationName is the name of the parser duration metric.
	ParserDurationName = "parser_duration_seconds"
	// ApplyDurationName is the name of the apply duration metric.
	ApplyDurationName = "apply_duration_seconds"
	// RemediateDurationName is the name of the remediate duration metric.
	RemediateDurationName = "remediate_duration_seconds"

	// Timestamp metrics.

	// LastSyncName is the name of the last sync timestamp metric.
	LastSyncName = "last_sync_timestamp"
	// LastApplyName is the name of the last apply timestamp metric.
	LastApplyName = "last_apply_timestamp"

	// Count and status metrics.

	// ReconcilerErrorsName is the name of the reconciler error count metric.
	ReconcilerErrorsName = "reconciler_errors"
	// PipelineErrorName is the name of the pipeline error status metric.
	PipelineErrorName = "pipeline_error_observed"
	// DeclaredResourcesName is the name of the declared resource count metric.
	DeclaredResourcesName = "declared_resources"
	// ApplyOperationsName is the name of the apply operations count metric.
	ApplyOperationsName = "apply_operations_total"
	// ResourceFightsName is the name of the resource fight count metric.
	ResourceFightsName = "resource_fights_total"
	// ResourceConflictsName is the name of the resource conflict count metric.
	ResourceConflictsName = "resource_conflicts_total"
	// InternalErrorsName is the name of the internal error count metric.
	InternalErrorsName = "internal_errors_total"
)

var (
// APICallDuration metric measures the latency of API server calls.
APICallDuration = stats.Float64(
"api_duration_seconds",
APICallDurationName,
"The duration of API server calls in seconds",
stats.UnitSeconds)

// ReconcilerErrors metric measures the number of errors in the reconciler.
ReconcilerErrors = stats.Int64(
"reconciler_errors",
ReconcilerErrorsName,
"The number of errors in the reconciler",
stats.UnitDimensionless)

Expand All @@ -34,73 +65,73 @@ var (
// controller, or the Prometheus exporter will error. b/247516388
// https://github.com/GoogleContainerTools/kpt-resource-group/blob/main/controllers/metrics/metrics.go#L88
PipelineError = stats.Int64(
"pipeline_error_observed",
PipelineErrorName,
"A boolean value indicates if error happened at readiness stage when syncing a commit",
stats.UnitDimensionless)

// ReconcileDuration metric measures the latency of reconcile events.
ReconcileDuration = stats.Float64(
"reconcile_duration_seconds",
ReconcileDurationName,
"The duration of reconcile events in seconds",
stats.UnitSeconds)

// ParserDuration metric measures the latency of the parse-apply-watch loop.
ParserDuration = stats.Float64(
"parser_duration_seconds",
ParserDurationName,
"The duration of the parse-apply-watch loop in seconds",
stats.UnitSeconds)

// LastSync metric measures the timestamp of the latest Git sync.
LastSync = stats.Int64(
"last_sync_timestamp",
LastSyncName,
"The timestamp of the most recent sync from Git",
stats.UnitDimensionless)

// DeclaredResources metric measures the number of declared resources parsed from Git.
DeclaredResources = stats.Int64(
"declared_resources",
DeclaredResourcesName,
"The number of declared resources parsed from Git",
stats.UnitDimensionless)

// ApplyOperations metric measures the number of applier apply events.
ApplyOperations = stats.Int64(
"apply_operations",
ApplyOperationsName,
"The number of operations that have been performed to sync resources to source of truth",
stats.UnitDimensionless)

// ApplyDuration metric measures the latency of applier apply events.
ApplyDuration = stats.Float64(
"apply_duration_seconds",
ApplyDurationName,
"The duration of applier events in seconds",
stats.UnitSeconds)

// ResourceFights metric measures the number of resource fights.
ResourceFights = stats.Int64(
"resource_fights",
ResourceFightsName,
"The number of resources that are being synced too frequently",
stats.UnitDimensionless)

// RemediateDuration metric measures the latency of remediator reconciliation events.
RemediateDuration = stats.Float64(
"remediate_duration_seconds",
RemediateDurationName,
"The duration of remediator reconciliation events",
stats.UnitSeconds)

// LastApply metric measures the timestamp of the most recent applier apply event.
LastApply = stats.Int64(
"last_apply_timestamp",
LastApplyName,
"The timestamp of the most recent applier event",
stats.UnitDimensionless)

// ResourceConflicts metric measures the number of resource conflicts.
ResourceConflicts = stats.Int64(
"resource_conflicts",
ResourceConflictsName,
"The number of resource conflicts resulting from a mismatch between the cached resources and cluster resources",
stats.UnitDimensionless)

// InternalErrors metric measures the number of unexpected internal errors triggered by defensive checks in Config Sync.
InternalErrors = stats.Int64(
"internal_errors",
InternalErrorsName,
"The number of internal errors triggered by Config Sync",
stats.UnitDimensionless)
)
28 changes: 14 additions & 14 deletions pkg/metrics/views.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ var longDistributionBounds = []float64{1, 5, 10, 30, 60, 300, 600, 1200, 1800, 3
var (
// APICallDurationView aggregates the APICallDuration metric measurements.
APICallDurationView = &view.View{
Name: APICallDuration.Name(),
Name: APICallDurationName,
Measure: APICallDuration,
Description: "The latency distribution of API server calls",
TagKeys: []tag.Key{KeyOperation, KeyStatus},
Expand All @@ -37,7 +37,7 @@ var (

// ReconcilerErrorsView aggregates the ReconcilerErrors metric measurements.
ReconcilerErrorsView = &view.View{
Name: ReconcilerErrors.Name(),
Name: ReconcilerErrorsName,
Measure: ReconcilerErrors,
Description: "The current number of errors in the RootSync and RepoSync reconcilers",
TagKeys: []tag.Key{KeyComponent, KeyErrorClass},
Expand All @@ -49,7 +49,7 @@ var (
// controller, or the Prometheus exporter will error. b/247516388
// https://github.com/GoogleContainerTools/kpt-resource-group/blob/main/controllers/metrics/views.go#L123
PipelineErrorView = &view.View{
Name: PipelineError.Name(),
Name: PipelineErrorName,
Measure: PipelineError,
Description: "A boolean value indicates if error happened from different stages when syncing a commit",
TagKeys: []tag.Key{KeyName, KeyReconcilerType, KeyComponent},
Expand All @@ -58,7 +58,7 @@ var (

// ReconcileDurationView aggregates the ReconcileDuration metric measurements.
ReconcileDurationView = &view.View{
Name: ReconcileDuration.Name(),
Name: ReconcileDurationName,
Measure: ReconcileDuration,
Description: "The latency distribution of RootSync and RepoSync reconcile events",
TagKeys: []tag.Key{KeyStatus},
Expand All @@ -67,7 +67,7 @@ var (

// ParserDurationView aggregates the ParserDuration metric measurements.
ParserDurationView = &view.View{
Name: ParserDuration.Name(),
Name: ParserDurationName,
Measure: ParserDuration,
Description: "The latency distribution of the parse-apply-watch loop",
TagKeys: []tag.Key{KeyStatus, KeyTrigger, KeyParserSource},
Expand All @@ -76,7 +76,7 @@ var (

// LastSyncTimestampView aggregates the LastSyncTimestamp metric measurements.
LastSyncTimestampView = &view.View{
Name: LastSync.Name(),
Name: LastSyncName,
Measure: LastSync,
Description: "The timestamp of the most recent sync from Git",
TagKeys: []tag.Key{KeyCommit, KeyStatus},
Expand All @@ -85,7 +85,7 @@ var (

// DeclaredResourcesView aggregates the DeclaredResources metric measurements.
DeclaredResourcesView = &view.View{
Name: DeclaredResources.Name(),
Name: DeclaredResourcesName,
Measure: DeclaredResources,
Description: "The current number of declared resources parsed from Git",
TagKeys: []tag.Key{KeyCommit},
Expand All @@ -94,7 +94,7 @@ var (

// ApplyOperationsView aggregates the ApplyOps metric measurements.
ApplyOperationsView = &view.View{
Name: ApplyOperations.Name() + "_total",
Name: ApplyOperationsName,
Measure: ApplyOperations,
Description: "The total number of operations that have been performed to sync resources to source of truth",
TagKeys: []tag.Key{KeyController, KeyOperation, KeyStatus},
Expand All @@ -103,7 +103,7 @@ var (

// ApplyDurationView aggregates the ApplyDuration metric measurements.
ApplyDurationView = &view.View{
Name: ApplyDuration.Name(),
Name: ApplyDurationName,
Measure: ApplyDuration,
Description: "The latency distribution of applier resource sync events",
TagKeys: []tag.Key{KeyCommit, KeyStatus},
Expand All @@ -112,7 +112,7 @@ var (

// LastApplyTimestampView aggregates the LastApplyTimestamp metric measurements.
LastApplyTimestampView = &view.View{
Name: LastApply.Name(),
Name: LastApplyName,
Measure: LastApply,
Description: "The timestamp of the most recent applier resource sync event",
TagKeys: []tag.Key{KeyCommit, KeyStatus},
Expand All @@ -121,15 +121,15 @@ var (

// ResourceFightsView aggregates the ResourceFights metric measurements.
ResourceFightsView = &view.View{
Name: ResourceFights.Name() + "_total",
Name: ResourceFightsName,
Measure: ResourceFights,
Description: "The total number of resources that are being synced too frequently",
Aggregation: view.Count(),
}

// RemediateDurationView aggregates the RemediateDuration metric measurements.
RemediateDurationView = &view.View{
Name: RemediateDuration.Name(),
Name: RemediateDurationName,
Measure: RemediateDuration,
Description: "The latency distribution of remediator reconciliation events",
TagKeys: []tag.Key{KeyStatus},
Expand All @@ -138,7 +138,7 @@ var (

// ResourceConflictsView aggregates the ResourceConflicts metric measurements.
ResourceConflictsView = &view.View{
Name: ResourceConflicts.Name() + "_total",
Name: ResourceConflictsName,
Measure: ResourceConflicts,
Description: "The total number of resource conflicts resulting from a mismatch between the cached resources and cluster resources",
TagKeys: []tag.Key{KeyCommit},
Expand All @@ -147,7 +147,7 @@ var (

// InternalErrorsView aggregates the InternalErrors metric measurements.
InternalErrorsView = &view.View{
Name: InternalErrors.Name() + "_total",
Name: InternalErrorsName,
Measure: InternalErrors,
Description: "The total number of internal errors triggered by Config Sync",
TagKeys: []tag.Key{KeyInternalErrorSource},
Expand Down
39 changes: 30 additions & 9 deletions pkg/resourcegroup/controllers/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,69 +22,90 @@ import (

const namespace = "resourcegroup"

// Metric name constants for the ResourceGroup controller metrics.
const (
	// RGReconcileDurationName is the name of the resource group reconcile
	// duration metric.
	RGReconcileDurationName = "rg_reconcile_duration_seconds"

	// Count metrics.

	// ResourceGroupTotalName is the name of the resource group count metric.
	ResourceGroupTotalName = "resource_group_total"
	// ResourceCountName is the name of the resource count metric.
	ResourceCountName = "resource_count"
	// ReadyResourceCountName is the name of the ready resource count metric.
	ReadyResourceCountName = "ready_resource_count"
	// KCCResourceCountName is the name of the KCC resource count metric.
	KCCResourceCountName = "kcc_resource_count"
	// NamespaceCountName is the name of the namespace count metric.
	NamespaceCountName = "resource_ns_count"
	// ClusterScopedResourceCountName is the name of the cluster-scoped
	// resource count metric.
	ClusterScopedResourceCountName = "cluster_scoped_resource_count"
	// CRDCountName is the name of the CRD count metric.
	CRDCountName = "crd_count"

	// PipelineErrorName is the name of the pipeline error status metric.
	// It is the same name as the corresponding metric in Config Sync.
	PipelineErrorName = "pipeline_error_observed"
)

var (
// ReconcileDuration tracks the time duration in seconds of reconciling
// a ResourceGroup CR by the ResourceGroup controller.
// label `reason`: the `Reason` field of the `Stalled` condition in a ResourceGroup CR.
// reason can be: StartReconciling, FinishReconciling, ComponentFailed, ExceedTimeout.
// This metric should be updated in the ResourceGroup controller.
ReconcileDuration = stats.Float64(
"rg_reconcile_duration_seconds",
RGReconcileDurationName,
"Time duration in seconds of reconciling a ResourceGroup CR by the ResourceGroup controller",
stats.UnitSeconds)

// ResourceGroupTotal tracks the total number of ResourceGroup CRs in a cluster.
// This metric should be updated in the Root controller.
ResourceGroupTotal = stats.Int64(
"resource_group_total",
ResourceGroupTotalName,
"Total number of ResourceGroup CRs in a cluster",
stats.UnitDimensionless)

// ResourceCount tracks the number of resources in a ResourceGroup CR.
// This metric should be updated in the Root controller.
ResourceCount = stats.Int64(
"resource_count",
ResourceCountName,
"The number of resources in a ResourceGroup CR",
stats.UnitDimensionless)

// ReadyResourceCount tracks the number of resources with Current status in a ResourceGroup CR.
// This metric should be updated in the ResourceGroup controller.
ReadyResourceCount = stats.Int64(
"ready_resource_count",
ReadyResourceCountName,
"The number of resources with Current status in a ResourceGroup CR",
stats.UnitDimensionless)

// KCCResourceCount tracks the number of KCC resources in a ResourceGroup CR.
// This metric should be updated in the ResourceGroup controller.
KCCResourceCount = stats.Int64(
"kcc_resource_count",
KCCResourceCountName,
"The number of KCC resources in a ResourceGroup CR",
stats.UnitDimensionless)

// NamespaceCount tracks the number of resource namespaces in a ResourceGroup CR.
// This metric should be updated in the Root controller.
NamespaceCount = stats.Int64(
"resource_ns_count",
NamespaceCountName,
"The number of resource namespaces in a ResourceGroup CR",
stats.UnitDimensionless)

// ClusterScopedResourceCount tracks the number of cluster-scoped resources in a ResourceGroup CR.
// This metric should be updated in the Root controller.
ClusterScopedResourceCount = stats.Int64(
"cluster_scoped_resource_count",
ClusterScopedResourceCountName,
"The number of cluster-scoped resources in a ResourceGroup CR",
stats.UnitDimensionless)

// CRDCount tracks the number of CRDs in a ResourceGroup CR.
// This metric should be updated in the Root controller.
CRDCount = stats.Int64(
"crd_count",
CRDCountName,
"The number of CRDs in a ResourceGroup CR",
stats.UnitDimensionless)

// PipelineError tracks the error that happened when syncing a commit
PipelineError = stats.Int64(
"pipeline_error_observed",
PipelineErrorName,
"A boolean value indicates if error happened at readiness stage when syncing a commit",
stats.UnitDimensionless)
)
Loading

0 comments on commit f012817

Please sign in to comment.