Mirror of https://github.com/Stunkymonkey/nixos.git
commit 7129af88ba (parent 8b56443b2d)
4 changed files with 20 additions and 20 deletions
@@ -92,7 +92,7 @@ in
   my.services.prometheus.rules = {
     BlackboxProbeFailed = {
-      condition = ''probe_success == 0'';
+      condition = "probe_success == 0";
       description = "Blackbox probe failed (instance {{ $labels.instance }}): {{$value}}";
       time = "1m";
       labels = {
@@ -100,7 +100,7 @@ in
       };
     };
     BlackboxConfigurationReloadFailure = {
-      condition = ''blackbox_exporter_config_last_reload_successful != 1'';
+      condition = "blackbox_exporter_config_last_reload_successful != 1";
       description = "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "0m";
       labels = {
@@ -108,7 +108,7 @@ in
       };
     };
     BlackboxSlowProbe = {
-      condition = ''avg_over_time(probe_duration_seconds[1m]) > 2'';
+      condition = "avg_over_time(probe_duration_seconds[1m]) > 2";
       description = "Blackbox probe took more than 2s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "1m";
       labels = {
@@ -116,12 +116,12 @@ in
       };
     };
     BlackboxProbeHttpFailure = {
-      condition = ''probe_http_status_code <= 199 OR probe_http_status_code >= 400'';
+      condition = "probe_http_status_code <= 199 OR probe_http_status_code >= 400";
       description = "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "1m";
     };
     BlackboxSslCertificateWillExpireSoon = {
-      condition = ''3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'';
+      condition = "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20";
       description = "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "0m";
       labels = {
@@ -129,7 +129,7 @@ in
       };
     };
     BlackboxSslCertificateWillExpireShortly = {
-      condition = ''0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'';
+      condition = "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3";
       description = "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "0m";
       labels = {
@@ -137,7 +137,7 @@ in
       };
     };
     BlackboxProbeSlowHttp = {
-      condition = ''avg_over_time(probe_http_duration_seconds[1m]) > 2'';
+      condition = "avg_over_time(probe_http_duration_seconds[1m]) > 2";
       description = "HTTP request took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "1m";
       labels = {
@@ -145,7 +145,7 @@ in
       };
     };
     BlackboxProbeSlowPing = {
-      condition = ''avg_over_time(probe_icmp_duration_seconds[1m]) > 1'';
+      condition = "avg_over_time(probe_icmp_duration_seconds[1m]) > 1";
       description = "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
       time = "1m";
       labels = {

@@ -76,7 +76,7 @@ in
   my.services.prometheus.rules = {
     navidrome_not_enough_albums = {
-      condition = ''http_navidrome_album_count != 1'';
+      condition = "http_navidrome_album_count != 1";
       description = "navidrome: not enough albums as expected: {{$value}}";
     };
   };

@@ -115,7 +115,7 @@ in
       description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% space left on its filesystem";
     };
     filesystem_inodes_full = {
-      condition = ''node_filesystem_files_free / node_filesystem_files < 0.10'';
+      condition = "node_filesystem_files_free / node_filesystem_files < 0.10";
       time = "10m";
       description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem";
     };
@@ -125,11 +125,11 @@ in
       description = "{{$labels.instance}}: filesystem has reported {{$value}} errors: check /sys/fs/ext4/*/errors_count";
     };
     disk_unusual_read = {
-      condition = ''sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
+      condition = "sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50";
       description = ''Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}'';
     };
     disk_unusual_write = {
-      condition = ''sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
+      condition = "sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50";
       description = ''Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}'';
     };
     # memory
@@ -144,21 +144,21 @@ in
       description = ''CPU load is > 80%\n VALUE = {{ $value }}'';
     };
     swap_is_filling = {
-      condition = ''(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80'';
+      condition = "(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80";
       description = "{{$labels.host}} is using {{$value}} (>80%) of its swap space";
     };
     oom_kill_detected = {
-      condition = ''increase(node_vmstat_oom_kill[1m]) > 0'';
+      condition = "increase(node_vmstat_oom_kill[1m]) > 0";
       description = ''OOM kill detected\n VALUE = {{ $value }}'';
       time = "0m";
     };
     # network
     network_unusual_throughput_in = {
-      condition = ''sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
+      condition = "sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100";
       description = ''Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}'';
     };
     network_unusual_throughput_out = {
-      condition = ''sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
+      condition = "sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100";
       description = ''Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}'';
     };
     # uptime
@@ -167,7 +167,7 @@ in
       description = "{{$labels.host}} just rebooted";
     };
     uptime = {
-      condition = ''node_time_seconds - node_boot_time_seconds > (30 * 24 * 60 * 60)'';
+      condition = "node_time_seconds - node_boot_time_seconds > (30 * 24 * 60 * 60)";
       description = "Uptime monster: {{$labels.host}} has been up for more than 30 days";
     };
     # systemd
@@ -178,7 +178,7 @@ in
     };
     # time
     clock_not_synchronising = {
-      condition = ''min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16'';
+      condition = "min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16";
       description = ''Clock not synchronising.\n VALUE = {{ $value }}'';
     };
   };

@@ -61,9 +61,9 @@ in
     };

     promtail_file_lagging = {
-      condition = ''abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6'';
+      condition = "abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6";
       time = "15m";
-      description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m'';
+      description = "{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m";
     };
   };
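
Aside on why this quoting change is safe: for PromQL expressions like the ones above, which contain no double quote, no ${, and no escape sequences, a Nix indented string (''…'') and a double-quoted string ("…") evaluate to the same value, so the generated Prometheus rules are unchanged. A minimal sketch (the file name check.nix is hypothetical and not part of this commit):

# check.nix - evaluate with `nix-instantiate --eval check.nix`
let
  indented = ''probe_success == 0'';   # indented string, no escapes needed
  quoted = "probe_success == 0";       # ordinary double-quoted string
in
# Evaluation aborts here if the two quoting styles ever produced different values.
assert indented == quoted;
{
  sameValue = indented == quoted;      # true
  # Caveat: in double-quoted strings "\n" is a real newline and `"` / `${`
  # must be escaped, which is why the descriptions embedding \n already
  # used double quotes before this change.
}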