| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  | # monitoring system services | 
					
						
							| 
									
										
										
										
											2024-07-28 21:08:02 +02:00
										 |  |  | { | 
					
						
							|  |  |  |   config, | 
					
						
							|  |  |  |   lib, | 
					
						
							|  |  |  |   pkgs, | 
					
						
							|  |  |  |   inputs, | 
					
						
							|  |  |  |   ... | 
					
						
							|  |  |  | }: | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  | let | 
					
						
							|  |  |  |   cfg = config.my.services.node-exporter; | 
					
						
							|  |  |  | in | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   options.my.services.node-exporter = with lib; { | 
					
						
							|  |  |  |     enable = mkEnableOption "Node-Exporter for monitoring"; | 
					
						
							|  |  |  |   }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   config = lib.mkIf cfg.enable { | 
					
						
							|  |  |  |     services.prometheus = { | 
					
						
							|  |  |  |       exporters = { | 
					
						
							|  |  |  |         node = { | 
					
						
							|  |  |  |           enable = true; | 
					
						
							| 
									
										
										
										
											2023-06-19 00:14:06 +02:00
										 |  |  |           enabledCollectors = [ | 
					
						
							|  |  |  |             "systemd" | 
					
						
							|  |  |  |             "textfile" | 
					
						
							|  |  |  |           ]; | 
					
						
							| 
									
										
										
										
											2024-07-28 21:08:02 +02:00
										 |  |  |           extraFlags = [ "--collector.textfile.directory=/etc/prometheus-node-exporter-text-files" ]; | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  |           port = 9100; | 
					
						
							|  |  |  |           listenAddress = "127.0.0.1"; | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |         systemd = { | 
					
						
							|  |  |  |           enable = true; | 
					
						
							|  |  |  |           listenAddress = "127.0.0.1"; | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       scrapeConfigs = [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |           job_name = "node"; | 
					
						
							| 
									
										
										
										
											2024-07-28 21:08:02 +02:00
										 |  |  |           static_configs = [ | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |               targets = [ "127.0.0.1:${toString config.services.prometheus.exporters.node.port}" ]; | 
					
						
							|  |  |  |               labels = { | 
					
						
							|  |  |  |                 instance = config.networking.hostName; | 
					
						
							|  |  |  |               }; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |           ]; | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |           job_name = "systemd"; | 
					
						
							| 
									
										
										
										
											2024-07-28 21:08:02 +02:00
										 |  |  |           static_configs = [ | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |               targets = [ "127.0.0.1:${toString config.services.prometheus.exporters.systemd.port}" ]; | 
					
						
							|  |  |  |               labels = { | 
					
						
							|  |  |  |                 instance = config.networking.hostName; | 
					
						
							|  |  |  |               }; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |           ]; | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |       ]; | 
					
						
							|  |  |  |     }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-19 00:14:06 +02:00
										 |  |  |     # inputs == flake inputs in configurations.nix | 
					
						
							|  |  |  |     environment.etc = | 
					
						
							|  |  |  |       let | 
					
						
							|  |  |  |         inputsWithDate = lib.filterAttrs (_: input: input ? lastModified) inputs; | 
					
						
							| 
									
										
										
										
											2024-07-28 21:08:02 +02:00
										 |  |  |         flakeAttrs = | 
					
						
							|  |  |  |           input: | 
					
						
							|  |  |  |           (lib.mapAttrsToList (n: v: ''${n}="${v}"'') ( | 
					
						
							|  |  |  |             lib.filterAttrs (_n: v: (builtins.typeOf v) == "string") input | 
					
						
							|  |  |  |           )); | 
					
						
							|  |  |  |         lastModified = | 
					
						
							|  |  |  |           name: input: | 
					
						
							|  |  |  |           ''flake_input_last_modified{input="${name}",${lib.concatStringsSep "," (flakeAttrs input)}} ${toString input.lastModified}''; | 
					
						
							| 
									
										
										
										
											2023-06-19 00:14:06 +02:00
										 |  |  |       in | 
					
						
							|  |  |  |       { | 
					
						
							|  |  |  |         "prometheus-node-exporter-text-files/flake-inputs.prom" = { | 
					
						
							|  |  |  |           mode = "0555"; | 
					
						
							|  |  |  |           text = ''
 | 
					
						
							|  |  |  |             # HELP flake_registry_last_modified Last modification date of flake input in unixtime | 
					
						
							|  |  |  |             # TYPE flake_input_last_modified gauge | 
					
						
							|  |  |  |             ${lib.concatStringsSep "\n" (lib.mapAttrsToList lastModified inputsWithDate)} | 
					
						
							|  |  |  |           '';
 | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  |     services.grafana.provision = { | 
					
						
							|  |  |  |       dashboards.settings.providers = [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |           name = "Node Exporter"; | 
					
						
							|  |  |  |           options.path = pkgs.grafana-dashboards.node-exporter; | 
					
						
							|  |  |  |           disableDeletion = true; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |           name = "Systemd"; | 
					
						
							|  |  |  |           options.path = pkgs.grafana-dashboards.node-systemd; | 
					
						
							|  |  |  |           disableDeletion = true; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |       ]; | 
					
						
							|  |  |  |     }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     my.services.prometheus.rules = { | 
					
						
							| 
									
										
										
										
											2023-06-19 00:14:06 +02:00
										 |  |  |       nixpkgs_out_of_date = { | 
					
						
							| 
									
										
										
										
											2023-09-10 21:50:53 +02:00
										 |  |  |         condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60 * 60 * 24) > 14''; | 
					
						
							|  |  |  |         description = "{{$labels.host}}: nixpkgs flake is older than two weeks"; | 
					
						
							| 
									
										
										
										
											2023-06-19 00:14:06 +02:00
										 |  |  |       }; | 
					
						
							| 
									
										
										
										
											2023-06-18 22:50:49 +02:00
										 |  |  |       # disk space | 
					
						
							|  |  |  |       filesystem_full_shortly = { | 
					
						
							|  |  |  |         condition = "predict_linear(node_filesystem_free[1h], (4 * 60 * 60)) < 0"; | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  |         time = "5m"; | 
					
						
							|  |  |  |         description = "Disk would fill up in 4 hours. Please check the disk space"; | 
					
						
							|  |  |  |         labels = { | 
					
						
							|  |  |  |           severity = "page"; | 
					
						
							|  |  |  |         }; | 
					
						
							|  |  |  |       }; | 
					
						
							| 
									
										
										
										
											2023-06-18 22:50:49 +02:00
										 |  |  |       filesystem_almost_full = { | 
					
						
							|  |  |  |         condition = ''100 - ((node_filesystem_avail_bytes{fstype!~"tmpfs|ramfs",mountpoint!="/nix/store"} * 100) / node_filesystem_size_bytes{fstype!~"tmpfs|ramfs",mountpoint!="/nix/store"}) >= 90''; | 
					
						
							|  |  |  |         time = "10m"; | 
					
						
							|  |  |  |         description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% space left on its filesystem"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       filesystem_inodes_full = { | 
					
						
							|  |  |  |         condition = ''node_filesystem_files_free / node_filesystem_files < 0.10''; | 
					
						
							|  |  |  |         time = "10m"; | 
					
						
							|  |  |  |         description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       # disk errors | 
					
						
							|  |  |  |       filesystem_errors = { | 
					
						
							|  |  |  |         condition = ''node_filesystem_device_error{fstype!~"tmpfs|ramfs"} > 0''; | 
					
						
							|  |  |  |         description = "{{$labels.instance}}: filesystem has reported {{$value}} errors: check /sys/fs/ext4/*/errors_count"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       disk_unusual_read = { | 
					
						
							|  |  |  |         condition = ''sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50''; | 
					
						
							|  |  |  |         description = ''Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       disk_unusual_write = { | 
					
						
							|  |  |  |         condition = ''sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50''; | 
					
						
							|  |  |  |         description = ''Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       # memory | 
					
						
							|  |  |  |       ram_almost_full = { | 
					
						
							|  |  |  |         condition = "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10"; | 
					
						
							|  |  |  |         time = "1h"; | 
					
						
							|  |  |  |         description = ''Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       load_high = { | 
					
						
							|  |  |  |         condition = ''100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80''; | 
					
						
							|  |  |  |         time = "0m"; | 
					
						
							|  |  |  |         description = ''CPU load is > 80%\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       swap_is_filling = { | 
					
						
							|  |  |  |         condition = ''(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80''; | 
					
						
							|  |  |  |         description = "{{$labels.host}} is using {{$value}} (>80%) of its swap space"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       oom_kill_detected = { | 
					
						
							|  |  |  |         condition = ''increase(node_vmstat_oom_kill[1m]) > 0''; | 
					
						
							|  |  |  |         description = ''OOM kill detected\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |         time = "0m"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       # network | 
					
						
							|  |  |  |       network_unusual_throughput_in = { | 
					
						
							|  |  |  |         condition = ''sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100''; | 
					
						
							|  |  |  |         description = ''Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       network_unusual_throughput_out = { | 
					
						
							|  |  |  |         condition = ''sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100''; | 
					
						
							|  |  |  |         description = ''Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       # uptime | 
					
						
							|  |  |  |       reboot = { | 
					
						
							|  |  |  |         condition = "node_time_seconds - node_boot_time_seconds < (5 * 30)"; | 
					
						
							|  |  |  |         description = "{{$labels.host}} just rebooted"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       uptime = { | 
					
						
							|  |  |  |         condition = ''node_time_seconds - node_boot_time_seconds > (30 * 24 * 60 * 60)''; | 
					
						
							|  |  |  |         description = "Uptime monster: {{$labels.host}} has been up for more than 30 days"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       # systemd | 
					
						
							| 
									
										
										
										
											2023-12-02 00:25:10 +01:00
										 |  |  |       systemd_unit_crashed = { | 
					
						
							| 
									
										
										
										
											2023-07-23 19:11:19 +02:00
										 |  |  |         # ignore user services | 
					
						
							|  |  |  |         condition = ''node_systemd_unit_state{state="failed", name!~"user@\\d+.service"} == 1''; | 
					
						
							| 
									
										
										
										
											2023-06-18 22:50:49 +02:00
										 |  |  |         description = "Host SystemD service crashed (instance {{ $labels.instance }})"; | 
					
						
							|  |  |  |       }; | 
					
						
							|  |  |  |       # time | 
					
						
							| 
									
										
										
										
											2023-06-22 20:54:16 +02:00
										 |  |  |       clock_not_synchronising = { | 
					
						
							| 
									
										
										
										
											2023-06-18 22:50:49 +02:00
										 |  |  |         condition = ''min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16''; | 
					
						
							|  |  |  |         description = ''Clock not synchronising.\n  VALUE = {{ $value }}''; | 
					
						
							|  |  |  |       }; | 
					
						
							| 
									
										
										
										
											2023-06-04 18:08:55 +02:00
										 |  |  |     }; | 
					
						
							|  |  |  |   }; | 
					
						
							|  |  |  | } |