nixos/modules/services/prometheus/default.nix

212 lines
6.1 KiB
Nix
Raw Normal View History

2023-04-07 22:39:45 +02:00
# monitoring system services
2024-07-28 21:08:02 +02:00
{
config,
lib,
pkgs,
...
}:
2023-04-07 22:39:45 +02:00
let
cfg = config.my.services.prometheus;
2023-11-07 23:13:51 +01:00
inherit (config.networking) domain;
2023-04-07 22:39:45 +02:00
in
{
options.my.services.prometheus = with lib; {
# Master switch: the whole `config` section of this module is wrapped in
# `lib.mkIf cfg.enable`.
enable = mkEnableOption "Prometheus for monitoring";
# Port handed to `services.prometheus.port`; also reused for the scrape
# target and the nginx virtual host entry further down.
port = mkOption {
  description = "Internal prometheus port";
  type = types.port;
  example = 3002;
  default = 9090;
};
# Ends up as `globalConfig.scrape_interval` of the Prometheus service.
scrapeInterval = mkOption {
  description = "Scrape interval";
  type = types.str;
  example = "1m";
  default = "15s";
};
# Forwarded unchanged to `services.prometheus.retentionTime`.
retentionTime = mkOption {
  type = types.str;
  default = "2y";
  # In Prometheus duration syntax `m` means minutes, so the former "1m"
  # example documented a one-minute retention. Use a day-based value that
  # is a realistic retention period instead.
  example = "30d";
  description = "retention time";
};
2023-04-27 23:09:24 +02:00
# A good collection of alerts can be found here: https://samber.github.io/awesome-prometheus-alerts/rules#blackbox
2023-04-27 23:09:24 +02:00
# Alerting rules, keyed by alert name. Every entry is rendered into the
# "alerting-rules" group of the rule file generated in the `config` section.
rules = mkOption {
  type = types.attrsOf (
    types.submodule {
      options = {
        # Required. There is deliberately no default: a `null` default does
        # not satisfy `types.str`, so leaving the field out used to fail with
        # a misleading type error instead of a "not defined" error.
        condition = mkOption {
          type = types.str;
          description = ''
            Prometheus alert expression.
          '';
          example = ''disk_used_percent{mode!="ro"} >= 90'';
        };
        # Required, for the same reason as `condition`. Used as the
        # `description` annotation of the generated alert.
        description = mkOption {
          type = types.str;
          description = ''
            Prometheus alert message.
          '';
          example = "Prometheus encountered value {{ $value }} with {{ $labels }}";
        };
        # Optional extra labels attached to the alert.
        labels = mkOption {
          type = types.nullOr (types.attrsOf types.str);
          description = ''
            Additional alert labels.
          '';
          # The example must itself be valid Nix: the `;` terminates the
          # binding inside the braces, not the attribute set.
          example = literalExpression ''
            { severity = "page"; }
          '';
          default = { };
        };
        # Used as the `for` field of the generated alert.
        time = mkOption {
          type = types.str;
          description = ''
            Time until the alert is fired.
          '';
          example = "5m";
          default = "2m";
        };
      };
    }
  );
  description = ''
    Defines the prometheus rules.
  '';
  default = { };
};
2023-04-07 22:39:45 +02:00
};
config = lib.mkIf cfg.enable {
services = {
# Upstream Prometheus service, driven by the `my.services.prometheus` options.
prometheus = {
  enable = true;
  webExternalUrl = "https://monitor.${domain}";
  inherit (cfg) port retentionTime;
  globalConfig = {
    scrape_interval = cfg.scrapeInterval;
  };
  ruleFiles = [
    # A single rule file generated from `cfg.rules`. The content is produced
    # with `builtins.toJSON` although the file is named `.yml`.
    (pkgs.writeText "prometheus-rules.yml" (
      builtins.toJSON {
        groups = [
          {
            name = "alerting-rules";
            rules = lib.mapAttrsToList (
              name: opts:
              {
                alert = name;
                expr = opts.condition;
                for = opts.time;
                annotations = {
                  inherit (opts) description;
                }
                # Only add the Grafana link when Grafana is enabled; previously
                # an empty-string `grafana` annotation was attached to every
                # alert when it was disabled.
                // lib.optionalAttrs config.services.grafana.enable {
                  grafana = "https://visualization.${domain}";
                };
              }
              # The option type allows `labels = null`; leave the key out in
              # that case instead of serializing `"labels": null`.
              // lib.optionalAttrs (opts.labels != null) {
                inherit (opts) labels;
              }
            ) cfg.rules;
          }
        ];
      }
    ))
  ];
  scrapeConfigs = [
    {
      # Prometheus scrapes itself; the instance label is the host name
      # rather than the `localhost:<port>` target address.
      job_name = "prometheus";
      static_configs = [
        {
          targets = [ "localhost:${toString cfg.port}" ];
          labels = {
            instance = config.networking.hostName;
          };
        }
      ];
    }
  ];
};
# Grafana provisioning: declares the local Prometheus as the default
# datasource and adds a dashboard provider that points at
# `pkgs.grafana-dashboards.prometheus`.
grafana.provision =
  let
    # Final configuration of the upstream Prometheus service.
    prom = config.services.prometheus;
  in
  {
    datasources.settings.datasources = [
      {
        name = "Prometheus";
        type = "prometheus";
        isDefault = true;
        url = "http://localhost:${toString prom.port}";
        jsonData = {
          prometheusType = "Prometheus";
          prometheusVersion = toString pkgs.prometheus.version;
          # Reuse the configured scrape interval as the datasource interval.
          timeInterval = prom.globalConfig.scrape_interval;
        };
      }
    ];
    dashboards.settings.providers = [
      {
        name = "Prometheus";
        disableDeletion = true;
        options = {
          path = pkgs.grafana-dashboards.prometheus;
        };
      }
    ];
  };
2023-04-07 22:39:45 +02:00
};
# Settings for the sibling `my.services` modules that are applied whenever
# this module is enabled.
my.services = {
  # Presumably keeps the Prometheus data directory out of backups; the
  # semantics are defined by the `backup` module, not in this file.
  backup.exclude = [ "/var/lib/prometheus2/data" ];

  # Turn on the node exporter module together with Prometheus.
  node-exporter.enable = true;

  # Virtual host entry for the "monitor" subdomain (matching
  # `webExternalUrl`), using the Prometheus port.
  nginx.virtualHosts = [
    {
      inherit (cfg) port;
      subdomain = "monitor";
    }
  ];

  # Default alerts about Prometheus/Alertmanager themselves; they are
  # declared through the `rules` option of this module.
  prometheus.rules = {
    # Restart detection over a 15 minute window.
    prometheus_too_many_restarts = {
      description = "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping";
      condition = ''changes(process_start_time_seconds{job=~"prometheus|alertmanager"}[15m]) > 2'';
    };
    # More than one distinct Alertmanager config hash is present.
    alert_manager_config_not_synced = {
      description = "Configurations of AlertManager cluster instances are out of sync";
      condition = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';
    };
    # No Alertmanager has been discovered.
    prometheus_not_connected_to_alertmanager = {
      description = "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
      condition = "prometheus_notifications_alertmanagers_discovered < 1";
    };
    # Any rule evaluation failure within the last 3 minutes.
    prometheus_rule_evaluation_failures = {
      description = "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
      condition = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";
    };
    # Any template expansion failure within the last 3 minutes; this one
    # overrides the default `time` with "0m".
    prometheus_template_expansion_failures = {
      description = "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
      condition = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";
      time = "0m";
    };
  };
};
2023-04-07 22:39:45 +02:00
2023-04-27 23:09:59 +02:00
# Entry for the `webapps` module that links to the Prometheus web UI.
webapps.apps.prometheus.dashboard = {
  name = "Monitoring";
  category = "infra";
  icon = "heart-pulse";
  # Same address as `services.prometheus.webExternalUrl`.
  url = "https://monitor.${domain}";
  method = "get";
};
};
}