# monitoring system services
#
# NixOS module behind `my.services.prometheus`. When enabled it:
#   * runs Prometheus on 127.0.0.1:<port> and scrapes itself,
#   * renders `my.services.prometheus.rules` into a single alerting rule file,
#   * provisions a Prometheus datasource and dashboard provider for Grafana,
#   * registers the "monitor" subdomain, a backup exclusion and a dashboard entry
#     through other `my.services.*` / `webapps.*` options (declared outside this
#     file).
{
  config,
  lib,
  pkgs,
  ...
}:
let
  cfg = config.my.services.prometheus;
  inherit (config.networking) domain;
  # Explicit inherits instead of `with lib;` so every name has a visible origin.
  inherit (lib)
    literalExpression
    mkEnableOption
    mkIf
    mkOption
    types
    ;
in
{
  options.my.services.prometheus = {
    enable = mkEnableOption "Prometheus for monitoring";

    port = mkOption {
      type = types.port;
      default = 9090;
      example = 3002;
      description = "Internal prometheus port";
    };

    scrapeInterval = mkOption {
      type = types.str;
      default = "15s";
      example = "1m";
      description = "Scrape interval";
    };

    retentionTime = mkOption {
      type = types.str;
      default = "2y";
      example = "1m";
      description = "retention time";
    };

    # A good collection of alerts can be found here: https://samber.github.io/awesome-prometheus-alerts/rules#blackbox
    rules = mkOption {
      type = types.attrsOf (
        types.submodule {
          options = {
            # Required: no default, because `null` is not a valid `types.str`
            # value and would only surface as a confusing type error.
            condition = mkOption {
              type = types.str;
              description = ''
                Prometheus alert expression.
              '';
              example = ''disk_used_percent{mode!="ro"} >= 90'';
            };
            # Required for the same reason as `condition`.
            description = mkOption {
              type = types.str;
              description = ''
                Prometheus alert message.
              '';
              example = "Prometheus encountered value {{ $value }} with {{ $labels }}";
            };
            labels = mkOption {
              type = types.nullOr (types.attrsOf types.str);
              description = ''
                Additional alert labels.
              '';
              example = literalExpression ''
                { severity = "page"; }
              '';
              default = { };
            };
            time = mkOption {
              type = types.str;
              description = ''
                Time until the alert is fired.
              '';
              example = "5m";
              default = "2m";
            };
          };
        }
      );
      description = ''
        Defines the prometheus rules.
      '';
      default = { };
    };
  };

  config = mkIf cfg.enable {
    services = {
      prometheus = {
        enable = true;
        webExternalUrl = "https://monitor.${domain}";
        inherit (cfg) port;
        # Only reachable locally; external access goes through the "monitor"
        # virtual host registered further down.
        listenAddress = "127.0.0.1";

        inherit (cfg) retentionTime;

        globalConfig = {
          scrape_interval = cfg.scrapeInterval;
        };

        ruleFiles = [
          # The rule file is emitted as JSON under a .yml name.
          (pkgs.writeText "prometheus-rules.yml" (
            builtins.toJSON {
              groups = [
                {
                  name = "alerting-rules";
                  # One alerting rule per attribute of `cfg.rules`; the
                  # attribute name becomes the alert name.
                  rules = lib.mapAttrsToList (name: opts: {
                    alert = name;
                    expr = opts.condition;
                    for = opts.time;
                    inherit (opts) labels;
                    annotations = {
                      inherit (opts) description;
                      # Empty string when Grafana is not enabled.
                      grafana = lib.optionalString config.services.grafana.enable "https://visualization.${domain}";
                    };
                  }) cfg.rules;
                }
              ];
            }
          ))
        ];

        scrapeConfigs = [
          {
            # Prometheus scraping its own metrics endpoint.
            job_name = "prometheus";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString cfg.port}" ];
                labels = {
                  instance = config.networking.hostName;
                };
              }
            ];
          }
        ];
      };

      grafana.provision = {
        datasources.settings.datasources = [
          {
            name = "Prometheus";
            type = "prometheus";
            isDefault = true;
            url = "http://127.0.0.1:${toString config.services.prometheus.port}";
            jsonData = {
              prometheusType = "Prometheus";
              prometheusVersion = toString pkgs.prometheus.version;
              timeInterval = config.services.prometheus.globalConfig.scrape_interval;
            };
          }
        ];
        dashboards.settings.providers = [
          {
            name = "Prometheus";
            options.path = pkgs.grafana-dashboards.prometheus;
            disableDeletion = true;
          }
        ];
      };
    };

    my.services = {
      node-exporter.enable = true;

      # Built-in alerts about Prometheus / Alertmanager themselves; they are
      # merged with any rules defined elsewhere through the same option.
      prometheus.rules = {
        prometheus_too_many_restarts = {
          condition = ''changes(process_start_time_seconds{job=~"prometheus|alertmanager"}[15m]) > 2'';
          description = "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping";
        };

        alert_manager_config_not_synced = {
          condition = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';
          description = "Configurations of AlertManager cluster instances are out of sync";
        };

        prometheus_not_connected_to_alertmanager = {
          condition = "prometheus_notifications_alertmanagers_discovered < 1";
          description = "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}";
        };

        prometheus_rule_evaluation_failures = {
          condition = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";
          description = "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}";
        };

        prometheus_template_expansion_failures = {
          condition = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";
          # "0m": no waiting period, unlike the "2m" default of `time`.
          time = "0m";
          description = "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}";
        };
      };

      # NOTE(review): `my.services.nginx.virtualHosts` is declared outside this
      # file; presumably it proxies monitor.<domain> to the given port — confirm.
      nginx.virtualHosts = [
        {
          subdomain = "monitor";
          inherit (cfg) port;
        }
      ];

      # NOTE(review): path presumably matches the Prometheus state directory
      # used by the upstream NixOS module — confirm.
      backup.exclude = [ "/var/lib/prometheus2/data" ];
    };

    webapps.apps = {
      prometheus.dashboard = {
        name = "Monitoring";
        category = "infra";
        icon = "heart-pulse";
        url = "https://monitor.${domain}";
        method = "get";
      };
    };
  };
}