nixos/modules/services/prometheus/default.nix

# Monitoring system services: NixOS module that configures Prometheus
# under the `my.services.prometheus` option namespace.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  # Shorthand for the values of this module's own options.
  cfg = config.my.services.prometheus;
  # Host domain; used below to build the public "monitor"/"visualization" URLs.
  inherit (config.networking) domain;
in
{
  # Option declarations for the Prometheus monitoring module.
  options.my.services.prometheus = with lib; {
    enable = mkEnableOption "Prometheus for monitoring";

    port = mkOption {
      type = types.port;
      default = 9090;
      example = 3002;
      description = "Internal prometheus port";
    };

    scrapeInterval = mkOption {
      type = types.str;
      default = "15s";
      example = "1m";
      description = "Scrape interval";
    };

    retentionTime = mkOption {
      type = types.str;
      default = "2y";
      # Note: in Prometheus durations `m` means minutes, not months.
      example = "30d";
      description = "How long to retain samples in storage (Prometheus duration string)";
    };

    # A good collection of alerts can be found here: https://samber.github.io/awesome-prometheus-alerts/rules#blackbox
    rules = mkOption {
      type = types.attrsOf (
        types.submodule {
          options = {
            # Required: intentionally has no default, so a rule without an
            # expression fails with "option is used but not defined" instead
            # of a confusing type error caused by a `null` default.
            condition = mkOption {
              type = types.str;
              description = ''
                Prometheus alert expression.
              '';
              example = ''disk_used_percent{mode!="ro"} >= 90'';
            };
            # Required: intentionally has no default (see `condition`).
            description = mkOption {
              type = types.str;
              description = ''
                Prometheus alert message.
              '';
              example = "Prometheus encountered value {{ $value }} with {{ $labels }}";
            };
            labels = mkOption {
              type = types.nullOr (types.attrsOf types.str);
              description = ''
                Additional alert labels.
              '';
              example = literalExpression ''
                { severity = "page"; }
              '';
              default = { };
            };
            time = mkOption {
              type = types.str;
              description = ''
                Time until the alert is fired.
              '';
              example = "5m";
              default = "2m";
            };
          };
        }
      );
      description = ''
        Defines the prometheus rules.
        The attribute name is used as the alert name.
      '';
      default = { };
    };
  };

  # Everything below only takes effect when `my.services.prometheus.enable` is set.
  config = lib.mkIf cfg.enable {
    services = {
      prometheus = {
        enable = true;
        webExternalUrl = "https://monitor.${domain}";
        inherit (cfg) port;

        inherit (cfg) retentionTime;

        globalConfig = {
          scrape_interval = cfg.scrapeInterval;
        };

        # Render every entry of `cfg.rules` into one rule file containing a
        # single group. The content is serialised with `builtins.toJSON` and
        # written to the store under a `.yml` name.
        ruleFiles = [
          (pkgs.writeText "prometheus-rules.yml" (
            builtins.toJSON {
              groups = [
                {
                  name = "alerting-rules";
                  rules = lib.mapAttrsToList (name: opts: {
                    alert = name;
                    expr = opts.condition;
                    for = opts.time;
                    inherit (opts) labels;
                    annotations = {
                      inherit (opts) description;
                    }
                    # Only add the Grafana link when Grafana is enabled, so
                    # alerts do not carry an empty `grafana` annotation.
                    // lib.optionalAttrs config.services.grafana.enable {
                      grafana = "https://visualization.${domain}";
                    };
                  }) cfg.rules;
                }
              ];
            }
          ))
        ];

        # Prometheus scrapes its own metrics endpoint.
        scrapeConfigs = [
          {
            job_name = "prometheus";
            static_configs = [
              {
                targets = [ "localhost:${toString cfg.port}" ];
                labels = {
                  instance = config.networking.hostName;
                };
              }
            ];
          }
        ];
      };

      # Provision Prometheus as the default Grafana datasource, plus a dashboard provider.
      grafana.provision = {
        datasources.settings.datasources = [
          {
            name = "Prometheus";
            type = "prometheus";
            isDefault = true;
            url = "http://localhost:${toString config.services.prometheus.port}";
            jsonData = {
              prometheusType = "Prometheus";
              prometheusVersion = toString pkgs.prometheus.version;
              timeInterval = config.services.prometheus.globalConfig.scrape_interval;
            };
          }
        ];
        dashboards.settings.providers = [
          {
            name = "Prometheus";
            # NOTE(review): `pkgs.grafana-dashboards` is not defined in this file;
            # presumably provided by a project overlay - confirm.
            options.path = pkgs.grafana-dashboards.prometheus;
            disableDeletion = true;
          }
        ];
      };
    };

    my.services = {
      node-exporter.enable = true;

      # Built-in alerts about Prometheus itself and its Alertmanager connection.
      # `time` falls back to the option default unless set explicitly.
      prometheus.rules = {
        prometheus_too_many_restarts = {
          condition = ''changes(process_start_time_seconds{job=~"prometheus|alertmanager"}[15m]) > 2'';
          description = "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping";
        };

        alert_manager_config_not_synced = {
          condition = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';
          description = "Configurations of AlertManager cluster instances are out of sync";
        };

        prometheus_not_connected_to_alertmanager = {
          condition = "prometheus_notifications_alertmanagers_discovered < 1";
          description = "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}";
        };

        prometheus_rule_evaluation_failures = {
          condition = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";
          description = "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}";
        };

        prometheus_template_expansion_failures = {
          condition = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";
          time = "0m";
          description = "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}";
        };
      };

      # NOTE(review): project-local option; presumably exposes the service
      # at monitor.<domain> via nginx - confirm against the nginx module.
      nginx.virtualHosts = [
        {
          subdomain = "monitor";
          inherit (cfg) port;
        }
      ];

      # Keep the metrics database out of backups. The path follows the
      # upstream module's state directory instead of hard-coding its default.
      backup.exclude = [ "/var/lib/${config.services.prometheus.stateDir}/data" ];
    };

    # NOTE(review): `webapps.apps` is declared outside this file; presumably
    # it feeds a dashboard/landing page entry - confirm.
    webapps.apps = {
      prometheus.dashboard = {
        name = "Monitoring";
        category = "infra";
        icon = "heart-pulse";
        url = "https://monitor.${domain}";
        method = "get";
      };
    };
  };
}
service/prometheus: init 2023-04-07 22:39:45 +02:00			`# monitoring system services`
treewide: fmt 2024-07-28 21:08:02 +02:00			`{`
			`config,`
			`lib,`
			`pkgs,`
			`...`
			`}:`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`let`
			`cfg = config.my.services.prometheus;`
treewide: fix statix errors 2023-11-07 23:13:51 +01:00			`inherit (config.networking) domain;`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`in`
			`{`
			`options.my.services.prometheus = with lib; {`
			`enable = mkEnableOption "Prometheus for monitoring";`

			`port = mkOption {`
			`type = types.port;`
			`default = 9090;`
			`example = 3002;`
service/prometheus: add rules option 2023-04-27 23:09:24 +02:00			`description = "Internal prometheus port";`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`};`

			`scrapeInterval = mkOption {`
			`type = types.str;`
			`default = "15s";`
			`example = "1m";`
			`description = "Scrape interval";`
			`};`

			`retentionTime = mkOption {`
			`type = types.str;`
			`default = "2y";`
			`example = "1m";`
			`description = "retention time";`
			`};`
service/prometheus: add rules option 2023-04-27 23:09:24 +02:00
service/prometheus: add source-url of alerts 2023-07-10 23:31:10 +02:00			`# a good collections for allerts can be found here: https://samber.github.io/awesome-prometheus-alerts/rules#blackbox`
service/prometheus: add rules option 2023-04-27 23:09:24 +02:00			`rules = mkOption {`
treewide: fmt 2024-07-28 21:08:02 +02:00			`type = types.attrsOf (`
			`types.submodule {`
service/prometheus: add rules option 2023-04-27 23:09:24 +02:00			`options = {`
			`condition = mkOption {`
			`type = types.str;`
			`description = ''`
			`Prometheus alert expression.`
			`'';`
service/prometheus: update example 2023-05-21 19:57:54 +02:00			`example = ''disk_used_percent{mode!="ro"} >= 90'';`
service/prometheus: add rules option 2023-04-27 23:09:24 +02:00			`default = null;`
			`};`
			`description = mkOption {`
			`type = types.str;`
			`description = ''`
			`Prometheus alert message.`
			`'';`
			`example = "Prometheus encountered value {{ $value }} with {{ $labels }}";`
			`default = null;`
			`};`
			`labels = mkOption {`
			`type = types.nullOr (types.attrsOf types.str);`
			`description = ''`
			`Additional alert labels.`
			`'';`
			`example = literalExpression ''`
			`{ severity = "page" };`
			`'';`
			`default = { };`
			`};`
			`time = lib.mkOption {`
			`type = lib.types.str;`
			`description = ''`
			`Time until the alert is fired.`
			`'';`
			`example = "5m";`
			`default = "2m";`
			`};`
			`};`
treewide: fmt 2024-07-28 21:08:02 +02:00			`}`
			`);`
service/prometheus: add rules option 2023-04-27 23:09:24 +02:00			`description = ''`
			`Defines the prometheus rules.`
			`'';`
			`default = { };`
			`};`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`};`

			`config = lib.mkIf cfg.enable {`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`services = {`
			`prometheus = {`
			`enable = true;`
			`webExternalUrl = "https://monitor.${domain}";`
			`inherit (cfg) port;`

			`inherit (cfg) retentionTime;`

			`globalConfig = {`
			`scrape_interval = cfg.scrapeInterval;`
			`};`

			`ruleFiles = [`
treewide: fmt 2024-07-28 21:08:02 +02:00			`(pkgs.writeText "prometheus-rules.yml" (`
			`builtins.toJSON {`
			`groups = [`
			`{`
			`name = "alerting-rules";`
			`rules = lib.mapAttrsToList (name: opts: {`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`alert = name;`
			`expr = opts.condition;`
			`for = opts.time;`
			`inherit (opts) labels;`
			`annotations = {`
			`inherit (opts) description;`
			`grafana = lib.optionalString config.services.grafana.enable "https://visualization.${domain}";`
			`};`
treewide: fmt 2024-07-28 21:08:02 +02:00			`}) cfg.rules;`
			`}`
			`];`
			`}`
			`))`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`];`

			`scrapeConfigs = [`
			`{`
			`job_name = "prometheus";`
treewide: fmt 2024-07-28 21:08:02 +02:00			`static_configs = [`
			`{`
treewide: use localhost instead of ipv4 2024-12-27 20:30:21 +01:00			`targets = [ "localhost:${toString cfg.port}" ];`
treewide: fmt 2024-07-28 21:08:02 +02:00			`labels = {`
			`instance = config.networking.hostName;`
			`};`
			`}`
			`];`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`}`
			`];`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`};`

treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`grafana.provision = {`
			`datasources.settings.datasources = [`
			`{`
			`name = "Prometheus";`
			`type = "prometheus";`
			`isDefault = true;`
treewide: use localhost instead of ipv4 2024-12-27 20:30:21 +01:00			`url = "http://localhost:${toString config.services.prometheus.port}";`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`jsonData = {`
			`prometheusType = "Prometheus";`
			`prometheusVersion = toString pkgs.prometheus.version;`
			`timeInterval = config.services.prometheus.globalConfig.scrape_interval;`
service/prometheus: enable systemd target and make datasource default 2023-04-13 23:50:21 +02:00			`};`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`}`
			`];`
			`dashboards.settings.providers = [`
			`{`
			`name = "Prometheus";`
			`options.path = pkgs.grafana-dashboards.prometheus;`
			`disableDeletion = true;`
			`}`
			`];`
			`};`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`};`

treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`my.services = {`
			`node-exporter.enable = true;`

			`prometheus.rules = {`
			`prometheus_too_many_restarts = {`
			`condition = ''changes(process_start_time_seconds{job=~"prometheus\|alertmanager"}[15m]) > 2'';`
			`description = "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping";`
			`};`

			`alert_manager_config_not_synced = {`
			`condition = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';`
			`description = "Configurations of AlertManager cluster instances are out of sync";`
			`};`

			`prometheus_not_connected_to_alertmanager = {`
			`condition = "prometheus_notifications_alertmanagers_discovered < 1";`
			`description = "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";`
			`};`

			`prometheus_rule_evaluation_failures = {`
			`condition = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";`
			`description = "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";`
			`};`

			`prometheus_template_expansion_failures = {`
			`condition = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";`
			`time = "0m";`
			`description = "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";`
			`};`
			`};`
service/node-exporter: seperate from prometheus 2023-06-04 18:08:55 +02:00
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`nginx.virtualHosts = [`
service/prometheus: add monitoring 2023-04-16 17:28:23 +02:00			`{`
treewide: avoid repetitive keys in attrSets 2023-11-12 23:36:30 +01:00			`subdomain = "monitor";`
			`inherit (cfg) port;`
service/prometheus: add monitoring 2023-04-16 17:28:23 +02:00			`}`
service/prometheus: move datasource config from grafana 2023-04-09 23:06:38 +02:00			`];`
service/prometheus: exclude data from backup 2024-05-05 13:06:40 +02:00
			`backup.exclude = [ "/var/lib/prometheus2/data" ];`
service/prometheus: move datasource config from grafana 2023-04-09 23:06:38 +02:00			`};`
service/prometheus: init 2023-04-07 22:39:45 +02:00
service/prometheus: update url 2023-04-27 23:09:59 +02:00			`webapps.apps = {`
			`prometheus.dashboard = {`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`name = "Monitoring";`
			`category = "infra";`
			`icon = "heart-pulse";`
service/homer: rename link to url 2023-11-12 20:39:44 +01:00			`url = "https://monitor.${domain}";`
service/prometheus: fix homer ping 2023-04-07 23:25:11 +02:00			`method = "get";`
service/prometheus: init 2023-04-07 22:39:45 +02:00			`};`
			`};`
			`};`
			`}`