From 8de3f0a4ee1e15b3793c0df47df06de7176cd96d Mon Sep 17 00:00:00 2001 From: Aditya Thebe Date: Fri, 18 Oct 2024 16:02:13 +0545 Subject: [PATCH] feat: accept additional check labels for metrics from the CLI --- api/v1/checks.go | 4 + cmd/root.go | 90 +++++++++++++++++---- pkg/metrics/metrics.go | 180 ++++++++++++++++++++++------------------- 3 files changed, 179 insertions(+), 95 deletions(-) diff --git a/api/v1/checks.go b/api/v1/checks.go index 573708d46..153692817 100644 --- a/api/v1/checks.go +++ b/api/v1/checks.go @@ -17,6 +17,10 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) +// AdditionalCheckMetricLabels lists the additional check label keys that should be included in the check metrics. +// By default, no check labels are exposed as metric labels. +var AdditionalCheckMetricLabels []string + const ( OnTransformMarkHealthy = "MarkHealthy" OnTransformMarkUnhealthy = "MarkUnhealthy" diff --git a/cmd/root.go b/cmd/root.go index 8632d0486..9c25f7510 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -6,6 +6,7 @@ import ( "os/signal" "time" + v1 "github.com/flanksource/canary-checker/api/v1" "github.com/flanksource/canary-checker/checks" "github.com/flanksource/canary-checker/pkg/jobs/canary" "github.com/flanksource/canary-checker/pkg/prometheus" @@ -67,7 +68,10 @@ var Root = &cobra.Command{ } if prometheus.PrometheusURL != "" { logger.Infof("Setting default prometheus: %s", prometheus.PrometheusURL) - runner.Prometheus, _ = prometheus.NewPrometheusAPI(context.New(), connection.HTTPConnection{URL: prometheus.PrometheusURL}) + runner.Prometheus, _ = prometheus.NewPrometheusAPI( + context.New(), + connection.HTTPConnection{URL: prometheus.PrometheusURL}, + ) } go func() { @@ -116,7 +120,11 @@ func deprecatedFlags(flags *pflag.FlagSet) { panic(err) } - _ = flags.Bool("expose-env", false, "Expose environment variables for use in all templates. 
Note this has serious security implications with untrusted canaries") + _ = flags.Bool( + "expose-env", + false, + "Expose environment variables for use in all templates. Note this has serious security implications with untrusted canaries", + ) if err := flags.MarkDeprecated("expose-env", "the flag used to be a no-op"); err != nil { panic(err) } @@ -155,24 +163,75 @@ func deprecatedFlags(flags *pflag.FlagSet) { func ServerFlags(flags *pflag.FlagSet) { flags.IntVar(&httpPort, "httpPort", httpPort, "Port to expose a health dashboard ") - flags.StringVar(&publicEndpoint, "public-endpoint", publicEndpoint, "Host on which the health dashboard is exposed. Could be used for generting-links, redirects etc.") + flags.StringSliceVar(&v1.AdditionalCheckMetricLabels, + "metric-labels-allowlist", + nil, + "comma-separated list of additional check label keys that should be included in the check metrics", + ) + + flags.StringVar( + &publicEndpoint, + "public-endpoint", + publicEndpoint, + "Host on which the health dashboard is exposed. 
Could be used for generting-links, redirects etc.", + ) flags.StringVar(&runner.RunnerName, "name", "local", "Server name shown in aggregate dashboard") - flags.StringSliceVar(&runner.IncludeCanaries, "include-check", []string{}, "(Deprecated: use --include-canary) Run matching canaries - useful for debugging") - flags.StringSliceVar(&runner.IncludeCanaries, "include-canary", []string{}, "Only run canaries matching the given names") - flags.StringSliceVar(&runner.IncludeLabels, "include-labels", nil, "Only run canaries matching the given label selector") - flags.StringSliceVar(&runner.IncludeNamespaces, "include-namespace", []string{}, "a comma separated list of namespaces whose canary should be run") + flags.StringSliceVar( + &runner.IncludeCanaries, + "include-check", + []string{}, + "(Deprecated: use --include-canary) Run matching canaries - useful for debugging", + ) + flags.StringSliceVar( + &runner.IncludeCanaries, + "include-canary", + []string{}, + "Only run canaries matching the given names", + ) + flags.StringSliceVar( + &runner.IncludeLabels, + "include-labels", + nil, + "Only run canaries matching the given label selector", + ) + flags.StringSliceVar( + &runner.IncludeNamespaces, + "include-namespace", + []string{}, + "a comma separated list of namespaces whose canary should be run", + ) flags.StringVarP(&query.DefaultCheckQueryWindow, "default-window", "", "1h", "Default search window") - flags.StringVar(&checks.DefaultArtifactConnection, "artifact-connection", "", "Specify the default connection to use for artifacts") + flags.StringVar( + &checks.DefaultArtifactConnection, + "artifact-connection", + "", + "Specify the default connection to use for artifacts", + ) flags.IntVar(&canary.ReconcilePageSize, "upstream-page-size", 500, "upstream reconciliation page size") flags.DurationVar(&canary.ReconcileMaxAge, "upstream-max-age", time.Hour*48, "upstream reconciliation max age") - flags.StringVar(&canary.UpstreamConf.Host, "upstream-host", 
os.Getenv("UPSTREAM_HOST"), "central canary checker instance to push/pull canaries") + flags.StringVar( + &canary.UpstreamConf.Host, + "upstream-host", + os.Getenv("UPSTREAM_HOST"), + "central canary checker instance to push/pull canaries", + ) flags.StringVar(&canary.UpstreamConf.Username, "upstream-user", os.Getenv("UPSTREAM_USER"), "upstream username") - flags.StringVar(&canary.UpstreamConf.Password, "upstream-password", os.Getenv("UPSTREAM_PASSWORD"), "upstream password") + flags.StringVar( + &canary.UpstreamConf.Password, + "upstream-password", + os.Getenv("UPSTREAM_PASSWORD"), + "upstream password", + ) flags.StringVar(&canary.UpstreamConf.AgentName, "agent-name", os.Getenv("AGENT_NAME"), "name of this agent") - flags.BoolVar(&canary.UpstreamConf.InsecureSkipVerify, "upstream-insecure-skip-verify", os.Getenv("UPSTREAM_INSECURE_SKIP_VERIFY") == "true", "Skip TLS verification on the upstream servers certificate") + flags.BoolVar( + &canary.UpstreamConf.InsecureSkipVerify, + "upstream-insecure-skip-verify", + os.Getenv("UPSTREAM_INSECURE_SKIP_VERIFY") == "true", + "Skip TLS verification on the upstream servers certificate", + ) duty.BindPFlags(flags, duty.SkipMigrationByDefaultMode) @@ -185,9 +244,12 @@ func init() { logger.UseSlog() Root.PersistentFlags().BoolVar(&logFail, "log-fail", false, "Log every failing check") Root.PersistentFlags().BoolVar(&logPass, "log-pass", false, "Log every passing check") - Root.PersistentFlags().StringVar(&otelcollectorURL, "otel-collector-url", "", "OpenTelemetry gRPC Collector URL in host:port format") - Root.PersistentFlags().StringVar(&otelServiceName, "otel-service-name", "canary-checker", "OpenTelemetry service name for the resource") - Root.PersistentFlags().StringVar(&prometheus.PrometheusURL, "prometheus", "", "URL of the prometheus server that is scraping this instance") + Root.PersistentFlags(). 
+ StringVar(&otelcollectorURL, "otel-collector-url", "", "OpenTelemetry gRPC Collector URL in host:port format") + Root.PersistentFlags(). + StringVar(&otelServiceName, "otel-service-name", "canary-checker", "OpenTelemetry service name for the resource") + Root.PersistentFlags(). + StringVar(&prometheus.PrometheusURL, "prometheus", "", "URL of the prometheus server that is scraping this instance") Root.AddCommand(Docs) Root.AddCommand(Run, Serve, Operator) Root.AddCommand(Serve, GoOffline) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 0c63e7faa..948809e8b 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -1,16 +1,13 @@ package metrics import ( - "fmt" - "sort" - "strings" + "slices" "time" "github.com/asecurityteam/rolling" v1 "github.com/flanksource/canary-checker/api/v1" "github.com/flanksource/canary-checker/pkg" "github.com/flanksource/canary-checker/pkg/runner" - "github.com/flanksource/commons/collections" "github.com/flanksource/duty/context" "github.com/flanksource/duty/types" cmap "github.com/orcaman/concurrent-map" @@ -18,22 +15,32 @@ import ( "github.com/samber/lo" ) -var ( - CounterType pkg.MetricType = "counter" - GaugeType pkg.MetricType = "gauge" - HistogramType pkg.MetricType = "histogram" +func init() { + CustomCounters = make(map[string]*prometheus.CounterVec) + CustomGauges = make(map[string]*prometheus.GaugeVec) + CustomHistograms = make(map[string]*prometheus.HistogramVec) - CustomGauges map[string]*prometheus.GaugeVec - CustomCounters map[string]*prometheus.CounterVec - CustomHistograms map[string]*prometheus.HistogramVec + // Register the metrics with a delay because + // v1.AdditionalCheckMetricLabels is nil during init. 
+ go func() { + time.Sleep(time.Second) + slices.Sort(v1.AdditionalCheckMetricLabels) + setupMetrics() + }() +} + +func setupMetrics() { RequestLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "canary_check_duration", Help: "A histogram of the response latency in milliseconds.", Buckets: []float64{5, 10, 25, 50, 200, 500, 1000, 3000, 10000, 30000}, }, - []string{"type", "endpoint", "canary_name", "canary_namespace", "owner", "severity", "key", "name"}, + append( + []string{"type", "endpoint", "canary_name", "canary_namespace", "owner", "severity", "key", "name"}, + v1.AdditionalCheckMetricLabels..., + ), ) Gauge = prometheus.NewGaugeVec( @@ -41,33 +48,65 @@ var ( Name: "canary_check", Help: "A gauge representing the canaries success (0) or failure (1)", }, - []string{"key", "type", "canary_name", "canary_namespace", "name"}, + append([]string{"key", "type", "canary_name", "canary_namespace", "name"}, v1.AdditionalCheckMetricLabels...), + ) + + checkLabels := []string{"type", "endpoint", "canary_name", "canary_namespace", "owner", "severity", "key", "name"} + checkLabels = append(checkLabels, v1.AdditionalCheckMetricLabels...) 
+ + OpsCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "canary_check_count", + Help: "The total number of checks", + }, + checkLabels, ) - GenericGauge = prometheus.NewGaugeVec( + CanaryCheckInfo = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "canary_check_gauge", - Help: "A gauge representing duration", + Name: "canary_check_info", + Help: "Information about the canary check", }, - []string{"type", "canary_name", "metric", "canary_namespace", "owner", "severity", "key", "name"}, + checkLabels, ) - GenericCounter = prometheus.NewCounterVec( + OpsSuccessCount = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "canary_check_counter", - Help: "A gauge representing counters", + Name: "canary_check_success_count", + Help: "The total number of successful checks", }, - []string{"type", "canary_name", "metric", "value", "canary_namespace", "owner", "severity", "key", "name"}, + checkLabels, ) - GenericHistogram = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "canary_check_histogram", - Help: "A histogram representing durations", - Buckets: []float64{5, 10, 25, 50, 200, 500, 1000, 2500, 5000, 10000, 20000}, + OpsFailedCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "canary_check_failed_count", + Help: "The total number of failed checks", }, - []string{"type", "canary_name", "metric", "canary_namespace", "owner", "severity", "key", "name"}, + checkLabels, ) + + prometheus.MustRegister(Gauge, CanaryCheckInfo, OpsCount, OpsSuccessCount, OpsFailedCount, RequestLatency) +} + +var ( + CounterType pkg.MetricType = "counter" + GaugeType pkg.MetricType = "gauge" + HistogramType pkg.MetricType = "histogram" + + CustomGauges map[string]*prometheus.GaugeVec + CustomCounters map[string]*prometheus.CounterVec + CustomHistograms map[string]*prometheus.HistogramVec + + // Global metrics + CanaryCheckInfo *prometheus.GaugeVec + Gauge *prometheus.GaugeVec + + // Check specific metrics + OpsCount 
*prometheus.CounterVec + OpsFailedCount *prometheus.CounterVec + OpsSuccessCount *prometheus.CounterVec + RequestLatency *prometheus.HistogramVec ) var ( @@ -76,13 +115,6 @@ var ( latencies = cmap.New() ) -func init() { - prometheus.MustRegister(Gauge, RequestLatency, GenericGauge, GenericCounter, GenericHistogram) - CustomCounters = make(map[string]*prometheus.CounterVec) - CustomGauges = make(map[string]*prometheus.GaugeVec) - CustomHistograms = make(map[string]*prometheus.HistogramVec) -} - func RemoveCheck(checks v1.Canary) { for _, check := range checks.Spec.GetAllChecks() { key := checks.GetKey(check) @@ -116,13 +148,18 @@ func GetMetrics(key string) (uptime types.Uptime, latency types.Latency) { return } -func Record(ctx context.Context, canary v1.Canary, result *pkg.CheckResult) (_uptime types.Uptime, _latency types.Latency) { +func Record( + ctx context.Context, + canary v1.Canary, + result *pkg.CheckResult, +) (_uptime types.Uptime, _latency types.Latency) { defer func() { e := recover() if e != nil { ctx.Errorf("panic recording metrics for %s ==> %s", result, e) } }() + if result == nil || result.Check == nil { ctx.Warnf("returned a nil result") return _uptime, _latency @@ -170,33 +207,38 @@ func Record(ctx context.Context, canary v1.Canary, result *pkg.CheckResult) (_up latency = _latencyV.(*rolling.TimePolicy) } - metricSuffix, keyValuePairs := metricLabels(result.Check.GetLabels()) - labels := append([]string{ - "type", checkType, - "endpoint", endpoint, - "canary_name", canaryName, - "canary_namespace", canaryNamespace, - "owner", owner, - "severity", severity, - "key", key, - "name", name, - }, keyValuePairs...) 
- - ctx.Counter(metricName("canary_check_count", metricSuffix), labels...).Add(1) + var additionalLabels []string + for _, key := range v1.AdditionalCheckMetricLabels { + if v, ok := result.Check.GetLabels()[key]; ok { + additionalLabels = append(additionalLabels, v) + } else { + // just insert an empty value + additionalLabels = append(additionalLabels, "") + } + } + + checkMetricLabels := append( + []string{checkType, endpoint, canaryName, canaryNamespace, owner, severity, key, name}, + additionalLabels...) + + OpsCount.WithLabelValues(checkMetricLabels...).Inc() + if result.Duration > 0 { - RequestLatency.WithLabelValues(checkType, endpoint, canaryName, canaryNamespace, owner, severity, key, name).Observe(float64(result.Duration)) + RequestLatency.WithLabelValues(checkMetricLabels...).Observe(float64(result.Duration)) latency.Append(float64(result.Duration)) } + gaugeLabels := append([]string{key, checkType, canaryName, canaryNamespace, name}, additionalLabels...) + if result.Pass { pass.Append(1) - Gauge.WithLabelValues(key, checkType, canaryName, canaryNamespace, name).Set(0) - ctx.Gauge(metricName("canary_check_info", metricSuffix), labels...).Set(0) + Gauge.WithLabelValues(gaugeLabels...).Set(0) - ctx.Counter(metricName("canary_check_success_count", metricSuffix), labels...).Add(1) + CanaryCheckInfo.WithLabelValues(checkMetricLabels...).Set(0) + OpsSuccessCount.WithLabelValues(checkMetricLabels...).Inc() // always add a failed count to ensure the metric is present in prometheus // for an uptime calculation - ctx.Counter(metricName("canary_check_failed_count", metricSuffix), labels...).Add(0) + OpsFailedCount.WithLabelValues(checkMetricLabels...).Add(0) for _, m := range result.Metrics { switch m.Type { @@ -218,9 +260,10 @@ func Record(ctx context.Context, canary v1.Canary, result *pkg.CheckResult) (_up } } else { fail.Append(1) - Gauge.WithLabelValues(key, checkType, canaryName, canaryNamespace, name).Set(1) - 
ctx.Gauge(metricName("canary_check_info", metricSuffix), labels...).Set(1) - ctx.Counter(metricName("canary_check_failed_count", metricSuffix), labels...).Add(1) + Gauge.WithLabelValues(gaugeLabels...).Set(1) + + CanaryCheckInfo.WithLabelValues(checkMetricLabels...).Set(1) + OpsFailedCount.WithLabelValues(checkMetricLabels...).Inc() } _uptime = types.Uptime{Passed: int(pass.Reduce(rolling.Sum)), Failed: int(fail.Reduce(rolling.Sum))} @@ -232,31 +275,6 @@ func Record(ctx context.Context, canary v1.Canary, result *pkg.CheckResult) (_up return _uptime, _latency } -func metricName(metric, sortedLabelKeys string) string { - if sortedLabelKeys == "" { - return metric - } - - return fmt.Sprintf("%s_%s", metric, sortedLabelKeys) -} - -func metricLabels(m map[string]string) (string, []string) { - if len(m) == 0 { - return "", nil - } - - keys := collections.MapKeys(m) - sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] }) - - var sortedKeyValue []string - for _, k := range keys { - sortedKeyValue = append(sortedKeyValue, k) - sortedKeyValue = append(sortedKeyValue, m[k]) - } - - return strings.Join(keys, "_"), sortedKeyValue -} - func getOrCreateGauge(m pkg.Metric) error { var gauge *prometheus.GaugeVec var ok bool