From 2315f51b93f17256f083d8d8ecd261954975e1a2 Mon Sep 17 00:00:00 2001
From: Felix Buehler
Date: Sun, 2 Jul 2023 22:57:03 +0200
Subject: [PATCH] service/blackbox: add alerts

---
Review notes (ignored by git am):
- BlackboxProbeHttpFailure now sets labels.severity like every other rule.
- Line structure was restored from a whitespace-collapsed copy; context-line
  indentation assumes the usual 2-space Nix layout -- TODO confirm against
  the tree (or apply with --ignore-whitespace).

 modules/services/blackbox/default.nix | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/modules/services/blackbox/default.nix b/modules/services/blackbox/default.nix
index bd480ea..0598ef3 100644
--- a/modules/services/blackbox/default.nix
+++ b/modules/services/blackbox/default.nix
@@ -81,6 +81,74 @@ in
       ];
     };

+    # Alerting rules built on the blackbox exporter's probe_* metrics.
+    my.services.prometheus.rules = {
+      BlackboxProbeFailed = {
+        condition = ''probe_success == 0'';
+        description = "Blackbox probe failed (instance {{ $labels.instance }}): {{$value}}";
+        time = "0m";
+        labels = {
+          severity = "critical";
+        };
+      };
+      BlackboxConfigurationReloadFailure = {
+        condition = ''blackbox_exporter_config_last_reload_successful != 1'';
+        description = "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "0m";
+        labels = {
+          severity = "warning";
+        };
+      };
+      BlackboxSlowProbe = {
+        condition = ''avg_over_time(probe_duration_seconds[1m]) > 1'';
+        description = "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "1m";
+        labels = {
+          severity = "warning";
+        };
+      };
+      # Explicit severity, consistent with the sibling rules; critical like BlackboxProbeFailed.
+      BlackboxProbeHttpFailure = {
+        condition = ''probe_http_status_code <= 199 OR probe_http_status_code >= 400'';
+        description = "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "0m";
+        labels = {
+          severity = "critical";
+        };
+      };
+      BlackboxSslCertificateWillExpireSoon = {
+        condition = ''3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'';
+        description = "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "0m";
+        labels = {
+          severity = "warning";
+        };
+      };
+      BlackboxSslCertificateWillExpireShortly = {
+        condition = ''0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'';
+        description = "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "0m";
+        labels = {
+          severity = "critical";
+        };
+      };
+      BlackboxProbeSlowHttp = {
+        condition = ''avg_over_time(probe_http_duration_seconds[1m]) > 1'';
+        description = "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "1m";
+        labels = {
+          severity = "warning";
+        };
+      };
+      BlackboxProbeSlowPing = {
+        condition = ''avg_over_time(probe_icmp_duration_seconds[1m]) > 1'';
+        description = "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
+        time = "1m";
+        labels = {
+          severity = "warning";
+        };
+      };
+    };
     services.grafana.provision.dashboards.settings.providers = [
       {
         name = "Blackbox";