From 8a8265bc9954d64fe06e7fa1c297b7813d4d5f15 Mon Sep 17 00:00:00 2001 From: Clement Erena Date: Tue, 5 Nov 2024 15:56:03 +0100 Subject: [PATCH] feat(observability-lib): add alerts for keystone workflow --- .../keystone-workflows/component.go | 235 ++++++++--- .../keystone-workflows/component_test.go | 2 +- .../dashboards/keystone-workflows/platform.go | 31 +- .../keystone-workflows/test-output.json | 365 +++++++++++++++--- 4 files changed, 481 insertions(+), 152 deletions(-) diff --git a/observability-lib/dashboards/keystone-workflows/component.go b/observability-lib/dashboards/keystone-workflows/component.go index bf242ef17..d358de4c3 100644 --- a/observability-lib/dashboards/keystone-workflows/component.go +++ b/observability-lib/dashboards/keystone-workflows/component.go @@ -4,14 +4,20 @@ import ( "github.com/grafana/grafana-foundation-sdk/go/alerting" "github.com/grafana/grafana-foundation-sdk/go/cog" "github.com/grafana/grafana-foundation-sdk/go/dashboard" + "github.com/grafana/grafana-foundation-sdk/go/expr" "github.com/smartcontractkit/chainlink-common/observability-lib/grafana" ) -func NewDashboard(props *Props) (*grafana.Dashboard, error) { - if err := platformBuildOpts(props); err != nil { +func NewDashboard(props *Props) (*grafana.Observability, error) { + if err := validateInput(props); err != nil { return nil, err } + props.AlertsTitlePrefix = "[Keystone]" + props.QueryFilters = `env=~"${env}", cluster=~"${cluster}"` + props.AlertsTags = map[string]string{ + "team": "keystone", + } builder := grafana.NewBuilder(&grafana.BuilderOptions{ Name: props.Name, @@ -24,8 +30,36 @@ func NewDashboard(props *Props) (*grafana.Dashboard, error) { builder.AddVars(vars(props)...) - builder.AddRow("General") - builder.AddPanel(general(props)...) + builder.AddRow("Engine") + builder.AddPanel(engine(props)...) + + builder.AddRow("Registry Syncer") + builder.AddPanel(registrySyncer(props)...) + + if props.SlackChannel != "" && props.SlackWebhookURL != "" { + builder.AddContactPoint(grafana.NewContactPoint(&grafana.ContactPointOptions{ + Name: "keystone-slack", + Type: "slack", + Settings: map[string]interface{}{ + "url": props.SlackWebhookURL, + "recipient": props.SlackChannel, + "username": "Keystone Alerts", + "title": `{{ template "slack.chainlink.title" . }}`, + "text": `{{ template "slack.chainlink.text" . }}`, + "color": `{{ template "slack.chainlink.color" . }}`, + }, + })) + + notificationPolicySlackOptions := &grafana.NotificationPolicyOptions{ + Receiver: "keystone-slack", + GroupBy: []string{"grafana_folder", "alertname"}, + Continue: grafana.Pointer(true), + } + for name, value := range props.AlertsTags { + notificationPolicySlackOptions.ObjectMatchers = append(notificationPolicySlackOptions.ObjectMatchers, alerting.ObjectMatcher{name, "=", value}) + } + builder.AddNotificationPolicy(grafana.NewNotificationPolicy(notificationPolicySlackOptions)) + } return builder.Build() } @@ -52,29 +86,10 @@ func vars(p *Props) []cog.Builder[dashboard.VariableModel] { Query: `label_values(platform_engine_workflows_count{env="$env"}, cluster)`, })) - variables = append(variables, grafana.NewQueryVariable(&grafana.QueryVariableOptions{ - VariableOption: &grafana.VariableOption{ - Label: "Workflow Owner", - Name: "workflowOwner", - }, - Datasource: p.MetricsDataSource.Name, - Query: `label_values(platform_engine_workflows_count{env="$env", cluster="$cluster"}, workflowOwner)`, - Multi: false, - })) - - variables = append(variables, grafana.NewQueryVariable(&grafana.QueryVariableOptions{ - VariableOption: &grafana.VariableOption{ - Label: "Workflow Name", - Name: "workflowName", - }, - Datasource: p.MetricsDataSource.Name, - Query: `label_values(platform_engine_workflows_count{env="$env", cluster="$cluster", workflowOwner="$workflowOwner"}, workflowName)`, - })) - return variables } -func general(p *Props) []*grafana.Panel { +func engine(p *Props) []*grafana.Panel { var panels []*grafana.Panel panels = append(panels, grafana.NewTimeSeriesPanel(&grafana.TimeSeriesPanelOptions{ @@ -86,36 +101,47 @@ func general(p *Props) []*grafana.Panel { Height: 8, Query: []grafana.Query{ { - Expr: `sum(platform_engine_workflows_count{` + p.platformOpts.LabelQuery + `}) by (workflowOwner, workflowName)`, + Expr: `sum(platform_engine_workflows_count{` + p.QueryFilters + `}) by (workflowOwner, workflowName)`, Legend: "{{ workflowOwner }} - {{ workflowName }}", }, }, }, - AlertOptions: &grafana.AlertOptions{ - Summary: "Keystone: No workflows are running", - Description: `The number of workflow running is {{ index $values "A" }}%`, - RunbookURL: "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", - For: "15m", - Tags: map[string]string{ - "severity": "critical", - }, - NoDataState: alerting.RuleNoDataStateOK, - Query: []grafana.RuleQuery{ - { - Expr: `sum(platform_engine_workflows_count{` + p.AlertsFilters + `})`, - RefID: "A", - Datasource: p.MetricsDataSource.UID, + AlertsOptions: []grafana.AlertOptions{ + { + Title: p.AlertsTitlePrefix + "[Engine] No Workflows Running", + Summary: "Platform Engine: No workflows are running", + Description: `{{ index $labels "job" }} number of workflow running is {{ index $values "B" }} in the last 1h`, + RunbookURL: "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", + For: "1h", + Tags: map[string]string{ + "severity": "critical", }, - }, - QueryRefCondition: "B", - Condition: []grafana.ConditionQuery{ - { - RefID: "B", - ThresholdExpression: &grafana.ThresholdExpression{ - Expression: "A", - ThresholdConditionsOptions: grafana.ThresholdConditionsOption{ - Params: []float64{1}, - Type: grafana.TypeThresholdTypeLt, + NoDataState: alerting.RuleNoDataStateOK, + Query: []grafana.RuleQuery{ + { + Expr: `platform_engine_workflows_count{` + p.AlertsFilters + `}`, + RefID: "A", + Datasource: p.MetricsDataSource.UID, + }, + }, + QueryRefCondition: "C", + // SUM(A) < 1 + Condition: []grafana.ConditionQuery{ + { + RefID: "B", + ReduceExpression: &grafana.ReduceExpression{ + Expression: "A", + Reducer: expr.TypeReduceReducerSum, + }, + }, + { + RefID: "C", + ThresholdExpression: &grafana.ThresholdExpression{ + Expression: "B", + ThresholdConditionsOptions: grafana.ThresholdConditionsOption{ + Params: []float64{1}, + Type: grafana.TypeThresholdTypeLt, + }, }, }, }, @@ -132,7 +158,7 @@ func general(p *Props) []*grafana.Panel { Height: 8, Query: []grafana.Query{ { - Expr: `sum(platform_engine_workflows_count{` + p.platformOpts.LabelQuery + `}) by (status)`, + Expr: `sum(platform_engine_workflows_count{` + p.QueryFilters + `}) by (status)`, Legend: "{{ status }}", }, }, @@ -142,15 +168,55 @@ func general(p *Props) []*grafana.Panel { panels = append(panels, grafana.NewTimeSeriesPanel(&grafana.TimeSeriesPanelOptions{ PanelOptions: &grafana.PanelOptions{ Datasource: p.MetricsDataSource.Name, - Title: "Workflow Execution Latency", + Title: "Register Trigger Failure", Description: "", Span: 8, Height: 8, - Unit: "ms", Query: []grafana.Query{ { - Expr: `sum(platform_engine_workflow_time{` + p.platformOpts.LabelQuery + `}) by (workflowExecutionID)`, - Legend: "WorkflowExecID: {{workflowExecutionID}}", + Expr: `platform_engine_registertrigger_failures{` + p.QueryFilters + `}`, + Legend: "", + }, + }, + }, + AlertsOptions: []grafana.AlertOptions{ + { + Title: p.AlertsTitlePrefix + "[Engine] Register Trigger Failure", + Summary: "Platform Engine: More than 1 failure over last 15m", + Description: `{{ index $labels "job" }} registered {{ index $values "A" }} trigger failures in the last 15m`, + RunbookURL: "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", + For: "15m", + Tags: map[string]string{ + "severity": "critical", + }, + NoDataState: alerting.RuleNoDataStateOK, + Query: []grafana.RuleQuery{ + { + Expr: `platform_engine_registertrigger_failures{` + p.AlertsFilters + `}`, + RefID: "A", + Datasource: p.MetricsDataSource.UID, + }, + }, + QueryRefCondition: "C", + // SUM(A) > 1 + Condition: []grafana.ConditionQuery{ + { + RefID: "B", + ReduceExpression: &grafana.ReduceExpression{ + Expression: "A", + Reducer: expr.TypeReduceReducerSum, + }, + }, + { + RefID: "C", + ThresholdExpression: &grafana.ThresholdExpression{ + Expression: "B", + ThresholdConditionsOptions: grafana.ThresholdConditionsOption{ + Params: []float64{1}, + Type: grafana.TypeThresholdTypeGt, + }, + }, + }, }, }, }, @@ -165,7 +231,7 @@ func general(p *Props) []*grafana.Panel { Height: 8, Query: []grafana.Query{ { - Expr: `platform_engine_workflow_errors{` + p.platformOpts.LabelQuery + `}`, + Expr: `platform_engine_workflow_errors{` + p.QueryFilters + `}`, Legend: "", }, }, @@ -175,14 +241,55 @@ func general(p *Props) []*grafana.Panel { panels = append(panels, grafana.NewTimeSeriesPanel(&grafana.TimeSeriesPanelOptions{ PanelOptions: &grafana.PanelOptions{ Datasource: p.MetricsDataSource.Name, - Title: "Register Trigger Failure", + Title: "Workflow Execution Latency p99", Description: "", Span: 8, Height: 8, + Unit: "ms", Query: []grafana.Query{ { - Expr: `platform_engine_registertrigger_failures{` + p.platformOpts.LabelQuery + `}`, - Legend: "", + Expr: `histogram_quantile(0.99, sum(rate(platform_engine_workflow_time{` + p.QueryFilters + `}[$__rate_interval])) by (le, job, workflowExecutionID))`, + Legend: "WorkflowExecID: {{workflowExecutionID}}", + }, + }, + }, + AlertsOptions: []grafana.AlertOptions{ + { + Title: p.AlertsTitlePrefix + "[Engine] Workflow Execution Latency p99", + Summary: "Workflow Execution latency (99th percentile) is high", + Description: `{{ index $labels "job" }} workflow latency is {{ index $values "B" }}ms`, + RunbookURL: "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", + For: "5m", + Tags: map[string]string{ + "severity": "critical", + }, + NoDataState: alerting.RuleNoDataStateOK, + Query: []grafana.RuleQuery{ + { + Expr: `histogram_quantile(0.99, sum(rate(platform_engine_workflow_time{` + p.AlertsFilters + `}[5m])) by (job, le))`, + RefID: "A", + Datasource: p.MetricsDataSource.UID, + }, + }, + QueryRefCondition: "C", + Condition: []grafana.ConditionQuery{ + { + RefID: "B", + ReduceExpression: &grafana.ReduceExpression{ + Expression: "A", + Reducer: expr.TypeReduceReducerMean, + }, + }, + { + RefID: "C", + ThresholdExpression: &grafana.ThresholdExpression{ + Expression: "B", + ThresholdConditionsOptions: grafana.ThresholdConditionsOption{ + Params: []float64{900000}, + Type: grafana.TypeThresholdTypeGt, + }, + }, + }, }, }, }, @@ -197,13 +304,19 @@ func general(p *Props) []*grafana.Panel { Height: 8, Query: []grafana.Query{ { - Expr: `platform_engine_capabilities_count{` + p.platformOpts.LabelQuery + `}`, + Expr: `platform_engine_capabilities_count{` + p.QueryFilters + `}`, Legend: "", }, }, }, })) + return panels +} + +func registrySyncer(p *Props) []*grafana.Panel { + var panels []*grafana.Panel + panels = append(panels, grafana.NewTimeSeriesPanel(&grafana.TimeSeriesPanelOptions{ PanelOptions: &grafana.PanelOptions{ Datasource: p.MetricsDataSource.Name, @@ -213,7 +326,7 @@ func general(p *Props) []*grafana.Panel { Height: 8, Query: []grafana.Query{ { - Expr: `platform_registrysyncer_sync_failures{` + p.platformOpts.LabelQuery + `}`, + Expr: `platform_registrysyncer_sync_failures{` + p.QueryFilters + `}`, Legend: "", }, }, @@ -229,7 +342,7 @@ func general(p *Props) []*grafana.Panel { Height: 8, Query: []grafana.Query{ { - Expr: `platform_registrysyncer_launch_failures{` + p.platformOpts.LabelQuery + `}`, + Expr: `platform_registrysyncer_launch_failures{` + p.QueryFilters + `}`, Legend: "", }, }, diff --git a/observability-lib/dashboards/keystone-workflows/component_test.go b/observability-lib/dashboards/keystone-workflows/component_test.go index fb38804af..15f525168 100644 --- a/observability-lib/dashboards/keystone-workflows/component_test.go +++ b/observability-lib/dashboards/keystone-workflows/component_test.go @@ -62,7 +62,7 @@ func TestNewDashboard(t *testing.T) { if err != nil { t.Errorf("Error creating dashboard: %v", err) } - require.IsType(t, grafana.Dashboard{}, *testDashboard) + require.IsType(t, grafana.Observability{}, *testDashboard) require.Equal(t, "Workflows", *testDashboard.Dashboard.Title) json, errJSON := testDashboard.GenerateJSON() if errJSON != nil { diff --git a/observability-lib/dashboards/keystone-workflows/platform.go b/observability-lib/dashboards/keystone-workflows/platform.go index 96199e3a8..a248da6eb 100644 --- a/observability-lib/dashboards/keystone-workflows/platform.go +++ b/observability-lib/dashboards/keystone-workflows/platform.go @@ -6,18 +6,12 @@ import ( "github.com/smartcontractkit/chainlink-common/observability-lib/grafana" ) -type platformOpts struct { - LabelFilters map[string]string - LabelFilter string - LegendString string - LabelQuery string -} - type Props struct { Name string // required: Name is the name of the dashboard MetricsDataSource *grafana.DataSource // required: MetricsDataSource is the datasource for querying metrics LogsDataSource *grafana.DataSource // required: LogsDataSource is the datasource for querying logs - platformOpts platformOpts + QueryFilters string + AlertsTitlePrefix string //optional AlertsFilters string //optional AlertsTags map[string]string SlackChannel string // optional @@ -54,24 +48,3 @@ func validateInput(props *Props) error { } return nil } - -func platformBuildOpts(props *Props) error { - if err := validateInput(props); err != nil { - return err - } - if !props.Tested { - po := platformOpts{ - LabelFilters: map[string]string{ - "env": `=~"${env}"`, - "cluster": `=~"${cluster}"`, - "workflowOwner": `=~"${workflowOwner}"`, - "workflowName": `=~"${workflowName}"`, - }, - } - for key, value := range po.LabelFilters { - po.LabelQuery += key + value + ", " - } - props.platformOpts = po - } - return nil -} diff --git a/observability-lib/dashboards/keystone-workflows/test-output.json b/observability-lib/dashboards/keystone-workflows/test-output.json index 88d8c83ae..fcf8198dd 100644 --- a/observability-lib/dashboards/keystone-workflows/test-output.json +++ b/observability-lib/dashboards/keystone-workflows/test-output.json @@ -18,7 +18,7 @@ { "type": "row", "collapsed": false, - "title": "General", + "title": "Engine", "gridPos": { "h": 1, "w": 24, @@ -33,7 +33,7 @@ "id": 1, "targets": [ { - "expr": "sum(platform_engine_workflows_count{}) by (workflowOwner, workflowName)", + "expr": "sum(platform_engine_workflows_count{env=~\"${env}\", cluster=~\"${cluster}\"}) by (workflowOwner, workflowName)", "format": "", "legendFormat": "{{ workflowOwner }} - {{ workflowName }}", "refId": "" @@ -83,7 +83,7 @@ "id": 2, "targets": [ { - "expr": "sum(platform_engine_workflows_count{}) by (status)", + "expr": "sum(platform_engine_workflows_count{env=~\"${env}\", cluster=~\"${cluster}\"}) by (status)", "format": "", "legendFormat": "{{ status }}", "refId": "" @@ -133,13 +133,13 @@ "id": 3, "targets": [ { - "expr": "sum(platform_engine_workflow_time{}) by (workflowExecutionID)", + "expr": "platform_engine_registertrigger_failures{env=~\"${env}\", cluster=~\"${cluster}\"}", "format": "", - "legendFormat": "WorkflowExecID: {{workflowExecutionID}}", + "legendFormat": "", "refId": "" } ], - "title": "Workflow Execution Latency", + "title": "Register Trigger Failure", "description": "", "transparent": false, "datasource": { @@ -165,7 +165,7 @@ }, "fieldConfig": { "defaults": { - "unit": "ms", + "unit": "", "decimals": 0, "noValue": "No data", "custom": { @@ -183,7 +183,7 @@ "id": 4, "targets": [ { - "expr": "platform_engine_workflow_errors{}", + "expr": "platform_engine_workflow_errors{env=~\"${env}\", cluster=~\"${cluster}\"}", "format": "", "legendFormat": "", "refId": "" @@ -233,13 +233,13 @@ "id": 5, "targets": [ { - "expr": "platform_engine_registertrigger_failures{}", + "expr": "histogram_quantile(0.99, sum(rate(platform_engine_workflow_time{env=~\"${env}\", cluster=~\"${cluster}\"}[$__rate_interval])) by (le, job, workflowExecutionID))", "format": "", - "legendFormat": "", + "legendFormat": "WorkflowExecID: {{workflowExecutionID}}", "refId": "" } ], - "title": "Register Trigger Failure", + "title": "Workflow Execution Latency p99", "description": "", "transparent": false, "datasource": { @@ -265,7 +265,7 @@ }, "fieldConfig": { "defaults": { - "unit": "", + "unit": "ms", "decimals": 0, "noValue": "No data", "custom": { @@ -283,7 +283,7 @@ "id": 6, "targets": [ { - "expr": "platform_engine_capabilities_count{}", + "expr": "platform_engine_capabilities_count{env=~\"${env}\", cluster=~\"${cluster}\"}", "format": "", "legendFormat": "", "refId": "" @@ -327,6 +327,119 @@ }, "overrides": null } + }, + { + "type": "row", + "collapsed": false, + "title": "Registry Syncer", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 0, + "panels": null + }, + { + "type": "timeseries", + "id": 7, + "targets": [ + { + "expr": "platform_registrysyncer_sync_failures{env=~\"${env}\", cluster=~\"${cluster}\"}", + "format": "", + "legendFormat": "", + "refId": "" + } + ], + "title": "Registry Syncer Failures", + "description": "", + "transparent": false, + "datasource": { + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "calcs": [] + }, + "tooltip": { + "mode": "", + "sort": "" + } + }, + "fieldConfig": { + "defaults": { + "unit": "", + "decimals": 0, + "noValue": "No data", + "custom": { + "fillOpacity": 0, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": null + } + }, + { + "type": "timeseries", + "id": 8, + "targets": [ + { + "expr": "platform_registrysyncer_launch_failures{env=~\"${env}\", cluster=~\"${cluster}\"}", + "format": "", + "legendFormat": "", + "refId": "" + } + ], + "title": "Registry Syncer Launcher Failures", + "description": "", + "transparent": false, + "datasource": { + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "calcs": [] + }, + "tooltip": { + "mode": "", + "sort": "" + } + }, + "fieldConfig": { + "defaults": { + "unit": "", + "decimals": 0, + "noValue": "No data", + "custom": { + "fillOpacity": 0, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": null + } } ], "templating": { @@ -372,66 +485,102 @@ }, "multi": false, "sort": 1 - }, + } + ] + }, + "annotations": {} + }, + "Alerts": [ + { + "annotations": { + "description": "{{ index $labels \"job\" }} number of workflow running is {{ index $values \"B\" }} in the last 1h", + "panel_title": "Workflows Running", + "runbook_url": "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", + "summary": "Platform Engine: No workflows are running" + }, + "condition": "C", + "data": [ { - "type": "query", - "name": "workflowOwner", - "label": "Workflow Owner", - "description": "", - "query": "label_values(platform_engine_workflows_count{env=\"$env\", cluster=\"$cluster\"}, workflowOwner)", - "datasource": { - "uid": "Prometheus" - }, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "datasourceUid": "1", + "model": { + "expr": "platform_engine_workflows_count{}", + "legendFormat": "__auto", + "refId": "A" }, - "multi": false, - "sort": 1 + "refId": "A", + "relativeTimeRange": { + "from": 600, + "to": 0 + } }, { - "type": "query", - "name": "workflowName", - "label": "Workflow Name", - "description": "", - "query": "label_values(platform_engine_workflows_count{env=\"$env\", cluster=\"$cluster\", workflowOwner=\"$workflowOwner\"}, workflowName)", - "datasource": { - "uid": "Prometheus" + "datasourceUid": "__expr__", + "model": { + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "reducer": "sum", + "refId": "B", + "type": "reduce" }, - "current": { - "selected": true, - "text": [ - "All" + "refId": "B", + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 1, + 0 + ], + "type": "lt" + } + } ], - "value": [ - "$__all" - ] + "expression": "B", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" }, - "multi": false, - "sort": 1 + "refId": "C", + "relativeTimeRange": { + "from": 600, + "to": 0 + } } - ] + ], + "execErrState": "Alerting", + "folderUID": "", + "for": "1h", + "labels": { + "severity": "critical", + "team": "keystone" + }, + "noDataState": "OK", + "orgID": 0, + "ruleGroup": "", + "title": "[Keystone][Engine] No Workflows Running" }, - "annotations": {} - }, - "Alerts": [ { "annotations": { - "description": "The number of workflow running is {{ index $values \"A\" }}%", + "description": "{{ index $labels \"job\" }} registered {{ index $values \"A\" }} trigger failures in the last 15m", + "panel_title": "Register Trigger Failure", "runbook_url": "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", - "summary": "Keystone: No workflows are running" + "summary": "Platform Engine: More than 1 failure over last 15m" }, - "condition": "B", + "condition": "C", "data": [ { "datasourceUid": "1", "model": { - "expr": "sum(platform_engine_workflows_count{})", + "expr": "platform_engine_registertrigger_failures{}", "legendFormat": "__auto", "refId": "A" }, @@ -441,6 +590,22 @@ "to": 0 } }, + { + "datasourceUid": "__expr__", + "model": { + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "reducer": "sum", + "refId": "B", + "type": "reduce" + }, + "refId": "B", + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, { "datasourceUid": "__expr__", "model": { @@ -451,33 +616,111 @@ 1, 0 ], - "type": "lt" + "type": "gt" } } ], + "expression": "B", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + }, + "refId": "C", + "relativeTimeRange": { + "from": 600, + "to": 0 + } + } + ], + "execErrState": "Alerting", + "folderUID": "", + "for": "15m", + "labels": { + "severity": "critical", + "team": "keystone" + }, + "noDataState": "OK", + "orgID": 0, + "ruleGroup": "", + "title": "[Keystone][Engine] Register Trigger Failure" + }, + { + "annotations": { + "description": "{{ index $labels \"job\" }} workflow latency is {{ index $values \"B\" }}ms", + "panel_title": "Workflow Execution Latency p99", + "runbook_url": "https://github.com/smartcontractkit/chainlink-common/tree/main/observability-lib", + "summary": "Workflow Execution latency (99th percentile) is high" + }, + "condition": "C", + "data": [ + { + "datasourceUid": "1", + "model": { + "expr": "histogram_quantile(0.99, sum(rate(platform_engine_workflow_time{}[5m])) by (job, le))", + "legendFormat": "__auto", + "refId": "A" + }, + "refId": "A", + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "datasourceUid": "__expr__", + "model": { "expression": "A", "intervalMs": 1000, "maxDataPoints": 43200, + "reducer": "mean", "refId": "B", - "type": "threshold" + "type": "reduce" }, "refId": "B", "relativeTimeRange": { "from": 600, "to": 0 } + }, + { + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 900000, + 0 + ], + "type": "gt" + } + } + ], + "expression": "B", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + }, + "refId": "C", + "relativeTimeRange": { + "from": 600, + "to": 0 + } } ], "execErrState": "Alerting", "folderUID": "", - "for": "15m", + "for": "5m", "labels": { - "severity": "critical" + "severity": "critical", + "team": "keystone" }, "noDataState": "OK", "orgID": 0, "ruleGroup": "", - "title": "Workflows Running" + "title": "[Keystone][Engine] Workflow Execution Latency p99" } ], "ContactPoints": null,