From b18ff2012c7e034ee3391995e0878080202a1d5a Mon Sep 17 00:00:00 2001
From: mostafa ghadimi
Date: Mon, 9 Sep 2024 03:01:46 +0330
Subject: [PATCH] :wrench: feat: modify the alert rule for DAG failure

---
NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. The indentation of the YAML lines in
the hunks below is reconstructed (2-space, indented sequences) -- confirm it
against config_files/prometheus/alerts/airflow.rules before applying, or
apply with `git apply --ignore-whitespace`.

The new AirflowDagFailed expression aggregates with `sum by (dag_id, task_id)`,
so only the dag_id and task_id labels remain on the resulting alert; it fires
when failed task-instance finishes increased over the last 5 minutes.

 config_files/prometheus/alerts/airflow.rules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config_files/prometheus/alerts/airflow.rules b/config_files/prometheus/alerts/airflow.rules
index 08706ae..6af0b5e 100644
--- a/config_files/prometheus/alerts/airflow.rules
+++ b/config_files/prometheus/alerts/airflow.rules
@@ -2,7 +2,7 @@ groups:
   - name: airflow_alerts
     rules:
       - alert: AirflowDagFailed
-        expr: sum by (dag_id) (af_agg_dagrun_duration_failed) > 0
+        expr: sum by (dag_id, task_id) (increase(af_agg_ti_finish{state="failed"}[5m])) > 0
         for: 1m
         labels:
           severity: critical
@@ -17,4 +17,4 @@ groups:
           severity: critical
         annotations:
           summary: "Airflow task failed: {{ $labels.task_name }}"
-          description: "Task '{{ $labels.task_name }}' in DAG '{{ $labels.dag_id }}' failed."
\ No newline at end of file
+          description: "Task '{{ $labels.task_name }}' in DAG '{{ $labels.dag_id }}' failed."