Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Add clustercache metrics #11789

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions controllers/clustercache/cluster_accessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,10 @@ func (ca *clusterAccessor) Connect(ctx context.Context) (retErr error) {
defer func() {
if retErr != nil {
log.Error(retErr, "Connect failed")
connectionUp.WithLabelValues(ca.cluster.String()).Set(0)
ca.lockedState.lastConnectionCreationErrorTimestamp = time.Now()
} else {
connectionUp.WithLabelValues(ca.cluster.String()).Set(1)
}
}()

Expand Down Expand Up @@ -298,15 +301,17 @@ func (ca *clusterAccessor) Connect(ctx context.Context) (retErr error) {
// Disconnect disconnects a connection to the workload cluster.
func (ca *clusterAccessor) Disconnect(ctx context.Context) {
log := ctrl.LoggerFrom(ctx)

if !ca.Connected(ctx) {
log.V(6).Info("Skipping disconnect, already disconnected")
return
}

ca.lock(ctx)
defer ca.unlock(ctx)

defer func() {
ca.unlock(ctx)
connectionUp.WithLabelValues(ca.cluster.String()).Set(0)
}()
log.Info("Disconnecting")

// Stopping the cache is non-blocking, so it's okay to do it while holding the lock.
Expand Down Expand Up @@ -351,14 +356,20 @@ func (ca *clusterAccessor) HealthCheck(ctx context.Context) (bool, bool) {
unauthorizedErrorOccurred = true
ca.lockedState.healthChecking.consecutiveFailures++
log.V(6).Info(fmt.Sprintf("Health probe failed (unauthorized error occurred): %v", err))
healthCheck.WithLabelValues(ca.cluster.String()).Set(0)
healthChecksTotal.WithLabelValues(ca.cluster.String(), "error").Inc()
case err != nil:
ca.lockedState.healthChecking.consecutiveFailures++
log.V(6).Info(fmt.Sprintf("Health probe failed (%d/%d): %v",
ca.lockedState.healthChecking.consecutiveFailures, ca.config.HealthProbe.FailureThreshold, err))
healthCheck.WithLabelValues(ca.cluster.String()).Set(0)
healthChecksTotal.WithLabelValues(ca.cluster.String(), "error").Inc()
default:
ca.lockedState.healthChecking.consecutiveFailures = 0
ca.lockedState.healthChecking.lastProbeSuccessTimestamp = ca.lockedState.healthChecking.lastProbeTimestamp
log.V(6).Info("Health probe succeeded")
healthCheck.WithLabelValues(ca.cluster.String()).Set(1)
healthChecksTotal.WithLabelValues(ca.cluster.String(), "success").Inc()
}

tooManyConsecutiveFailures := ca.lockedState.healthChecking.consecutiveFailures >= ca.config.HealthProbe.FailureThreshold
Expand Down
56 changes: 56 additions & 0 deletions controllers/clustercache/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package clustercache

import (
"github.com/prometheus/client_golang/prometheus"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func init() {
// Register the metrics at the controller-runtime metrics registry.
ctrlmetrics.Registry.MustRegister(healthCheck)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also have to register connectionUp

ctrlmetrics.Registry.MustRegister(connectionUp)
ctrlmetrics.Registry.MustRegister(healthChecksTotal)
}

var (
healthCheck = prometheus.NewGaugeVec(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should also have a _total metric for healthchecks. Could be useful to have a counter to see how often the health check succeeded/failed

prometheus.GaugeOpts{
Name: "capi_cluster_cache_health_check",
Help: "Result of the last clustercache healthcheck for a cluster.",
}, []string{
"cluster",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know if there's some sort of standard/convention on how to encode namespace + name into metric labels?

If there is, would be nice to follow it to make it easier to correlate / join metrics

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point! will take a look into this

},
)
healthChecksTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "capi_cluster_cache_health_checks_total",
Help: "Results of all clustercache healthchecks.",
}, []string{
"cluster", "status",
},
)
connectionUp = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "capi_cluster_cache_connection_up",
Help: "Whether the connection to the cluster is up.",
}, []string{
"cluster",
},
)
)