mirror of
https://github.com/Stunkymonkey/nixos.git
synced 2025-05-24 18:04:41 +02:00
178 lines
6.9 KiB
Nix
178 lines
6.9 KiB
Nix
# monitoring system services
|
|
{ config, lib, pkgs, inputs, ... }:
|
|
let
  # Shorthand for this module's own option values.
  cfg = config.my.services.node-exporter;
in
|
|
{
|
|
# Single switch that turns on everything defined under `config` below.
options.my.services.node-exporter.enable =
  lib.mkEnableOption "Node-Exporter for monitoring";
|
|
|
|
config = lib.mkIf cfg.enable {
|
|
services.prometheus =
  let
    # Resolved exporter options, used to look up the port each exporter listens on.
    exporterCfg = config.services.prometheus.exporters;

    # Scrape job for the exporter called `name` on the loopback address.
    # The `instance` label is set to the machine's host name.
    localJob = name: {
      job_name = name;
      static_configs = [
        {
          targets = [ "127.0.0.1:${toString exporterCfg.${name}.port}" ];
          labels.instance = config.networking.hostName;
        }
      ];
    };
  in
  {
    # node exporter: systemd and textfile collectors enabled, loopback only.
    exporters.node = {
      enable = true;
      listenAddress = "127.0.0.1";
      port = 9100;
      enabledCollectors = [ "systemd" "textfile" ];
      # The textfile collector reads *.prom files from this directory.
      extraFlags = [
        "--collector.textfile.directory=/etc/prometheus-node-exporter-text-files"
      ];
    };

    # systemd exporter, loopback only.
    exporters.systemd = {
      enable = true;
      listenAddress = "127.0.0.1";
    };

    scrapeConfigs = map localJob [ "node" "systemd" ];
  };
|
|
|
|
# inputs == flake inputs in configurations.nix
#
# Writes a Prometheus text-format file with one `flake_input_last_modified`
# sample per flake input, into the directory the node exporter's textfile
# collector is pointed at.
environment.etc =
  let
    # Only inputs that expose a `lastModified` attribute can be reported.
    inputsWithDate = lib.filterAttrs (_: input: input ? lastModified) inputs;

    # Every string-typed attribute of an input, rendered as a `key="value"` label pair.
    flakeAttrs = input: (lib.mapAttrsToList (n: v: ''${n}="${v}"'')
      (lib.filterAttrs (n: v: (builtins.typeOf v) == "string") input));

    # One sample line: the input name plus its string attributes as labels,
    # and `lastModified` as the value.
    lastModified = name: input: ''
      flake_input_last_modified{input="${name}",${lib.concatStringsSep "," (flakeAttrs input)}} ${toString input.lastModified}'';
  in
  {
    "prometheus-node-exporter-text-files/flake-inputs.prom" = {
      mode = "0555";
      # The HELP and TYPE lines must name the same metric as the samples.
      text = ''
        # HELP flake_input_last_modified Last modification date of flake input in unixtime
        # TYPE flake_input_last_modified gauge
        ${lib.concatStringsSep "\n" (lib.mapAttrsToList lastModified inputsWithDate)}
      '';
    };
  };
|
|
|
|
# Provision the packaged dashboards for the two exporters enabled above.
services.grafana.provision.dashboards.settings.providers =
  let
    # Dashboard provider entry with deletion disabled.
    mkProvider = name: path: {
      inherit name;
      options.path = path;
      disableDeletion = true;
    };
  in
  [
    (mkProvider "Node Exporter" pkgs.grafana-dashboards.node-exporter)
    (mkProvider "Systemd" pkgs.grafana-dashboards.node-systemd)
  ];
|
|
|
|
# Alert rules over the node and systemd exporter metrics.
# Each rule carries a PromQL `condition` and a `description` template;
# `time` (presumably the alert's `for` duration, defined by the
# my.services.prometheus module) and `labels` are optional.
#
# Description templates use `$labels.instance`, which is the label the scrape
# configs in this module set, and `$labels.mountpoint` for filesystem series.
#
# NOTE(review): inside ''…'' strings `\n` is a literal backslash followed by
# `n`, not a newline; confirm how the rules module renders descriptions.
my.services.prometheus.rules = {
  nixpkgs_out_of_date = {
    condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60 * 60 * 24) > 7'';
    description = "{{$labels.instance}}: nixpkgs flake is older than a week";
  };

  # disk space
  filesystem_full_shortly = {
    # Uses the `_bytes` metric name, consistent with the other filesystem rules.
    condition = "predict_linear(node_filesystem_free_bytes[1h], (4 * 60 * 60)) < 0";
    time = "5m";
    description = "Disk would fill up in 4 hours. Please check the disk space";
    labels = {
      severity = "page";
    };
  };
  filesystem_almost_full = {
    condition = ''100 - ((node_filesystem_avail_bytes{fstype!~"tmpfs|ramfs",mountpoint!="/nix/store"} * 100) / node_filesystem_size_bytes{fstype!~"tmpfs|ramfs",mountpoint!="/nix/store"}) >= 90'';
    time = "10m";
    description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem";
  };
  filesystem_inodes_full = {
    condition = ''node_filesystem_files_free / node_filesystem_files < 0.10'';
    time = "10m";
    description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% inodes left on its filesystem";
  };

  # disk errors
  filesystem_errors = {
    condition = ''node_filesystem_device_error{fstype!~"tmpfs|ramfs"} > 0'';
    description = "{{$labels.instance}}: filesystem has reported {{$value}} errors: check /sys/fs/ext4/*/errors_count";
  };
  disk_unusual_read = {
    condition = ''sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'';
    description = ''Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}'';
  };
  disk_unusual_write = {
    condition = ''sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'';
    description = ''Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}'';
  };

  # memory
  ram_almost_full = {
    condition = "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10";
    time = "1h";
    description = ''Node memory is filling up (< 10% left)\n VALUE = {{ $value }}'';
  };
  load_high = {
    condition = ''100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'';
    time = "0m";
    description = ''CPU load is > 80%\n VALUE = {{ $value }}'';
  };
  swap_is_filling = {
    condition = ''(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80'';
    description = "{{$labels.instance}} is using {{$value}} (>80%) of its swap space";
  };
  oom_kill_detected = {
    condition = ''increase(node_vmstat_oom_kill[1m]) > 0'';
    description = ''OOM kill detected\n VALUE = {{ $value }}'';
    time = "0m";
  };

  # network
  network_unusual_throughput_in = {
    condition = ''sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'';
    description = ''Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}'';
  };
  network_unusual_throughput_out = {
    condition = ''sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'';
    description = ''Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}'';
  };

  # uptime
  reboot = {
    # 5 * 30 = 150 seconds since boot.
    # NOTE(review): possibly meant 5 * 60 (five minutes) — confirm.
    condition = "node_time_seconds - node_boot_time_seconds < (5 * 30)";
    description = "{{$labels.instance}} just rebooted";
  };
  uptime = {
    condition = ''node_time_seconds - node_boot_time_seconds > (30 * 24 * 60 * 60)'';
    description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days";
  };

  # systemd
  systemd_crashed = {
    # ignore user services
    condition = ''node_systemd_unit_state{state="failed", name!~"user@\\d+.service"} == 1'';
    description = "Host SystemD service crashed (instance {{ $labels.instance }})";
  };

  # time
  clock_not_synchronising = {
    condition = ''min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16'';
    description = ''Clock not synchronising.\n VALUE = {{ $value }}'';
  };
};
|
|
};
|
|
}
|