{ config, lib, pkgs, ... }:
# NixOS module: Prometheus with a local Alertmanager, a bind mount that backs
# the Prometheus working directory with /data/prometheus, and an nginx virtual
# host (ACME + basic auth) proxying to both services.
let
  cfg = config.services.prometheus;

  # Build a `static_configs` value (a one-element list holding an attribute
  # set with a `targets` list) from a list of "host:port" strings.
  mkStaticTargets = targets: lib.singleton { inherit targets; };

  # Same as mkStaticTargets, for a single "host:port" string.
  mkStaticTarget = target: mkStaticTargets (lib.singleton target);
in
{
  # Bind-mount /data/prometheus onto the working directory of
  # prometheus.service. Ordered after data.mount and before the service.
  systemd.mounts = lib.singleton {
    after = [ "data.mount" ];
    before = [ "prometheus.service" ];
    wantedBy = [ "local-fs.target" ];
    what = "/data/prometheus";
    where = config.systemd.services.prometheus.serviceConfig.WorkingDirectory;
    type = "none";
    options = "bind";
  };

  services.prometheus = {
    enable = true;
    # Listens on loopback only; the nginx virtual host below proxies to it.
    listenAddress = "127.0.0.1";

    globalConfig = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };

    # Send alerts to the Alertmanager instance configured below.
    alertmanagers = [
      {
        static_configs = mkStaticTarget "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
      }
    ];

    alertmanager = {
      enable = true;
      listenAddress = "127.0.0.1";
      configuration = {
        global.resolve_timeout = "2m";
        route = {
          receiver = "matrix";
          group_by = [ "alertname" ];
          group_wait = "3m";
        };
        receivers = [
          {
            name = "matrix";
            webhook_configs = [
              # FIXME: Add correct URL
              { url = "http://matrix-alertmanager:3000/alerts"; }
            ];
          }
        ];
      };
    };

    scrapeConfigs = [
      # Prometheus scraping itself.
      {
        job_name = "prometheus";
        static_configs = mkStaticTarget "localhost:${toString cfg.port}";
      }
      # Hosts scraped on port 9100 (presumably node_exporter, given the
      # node_* metrics used in the rules below; confirm on the hosts).
      {
        job_name = "node";
        static_configs = mkStaticTargets [
          "fuuko.vpn.sbruder.de:9100"
          "issei.vpn.sbruder.de:9100"
          "nunotaba.vpn.sbruder.de:9100"
          "sayuri.vpn.sbruder.de:9100"
          "vueko.vpn.sbruder.de:9100"
        ];
      }
    ];

    rules =
      let
        # Turn a compact alert description into an alerting rule.
        #   name        - alert name
        #   expr        - PromQL expression
        #   for         - how long expr must hold before firing (default "1m")
        #   description - optional annotation; omitted from the rule when null
        mkAlert = { name, expr, for ? "1m", description ? null }: {
          alert = name;
          inherit expr for;
          annotations = lib.optionalAttrs (description != null) { inherit description; };
        };
      in
      [
        # `rules` takes a list of strings, so the rule group is serialized
        # with lib.generators.toYAML.
        (lib.generators.toYAML { } {
          groups = lib.singleton {
            name = "alert.rules";
            rules = map mkAlert [
              {
                name = "InstanceDown";
                # nunotaba and sayuri are excluded (presumably because they
                # are not always powered on; confirm). The dots are escaped
                # so they match literally; PromQL double-quoted strings
                # follow Go escaping, hence the doubled backslash.
                expr = ''up{instance!~"(nunotaba|sayuri)\\.vpn\\.sbruder\\.de:.*"} == 0'';
                description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.";
              }
              {
                name = "SystemdUnitFailed";
                expr = ''node_systemd_unit_state{state="failed"} == 1'';
                description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
              }
              {
                name = "NodeHighLoad";
                # 15-minute load divided by the number of
                # node_cpu_seconds_total{mode="system"} series per instance.
                expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
                for = "15m";
                description = "This node is having a per-core load ≥ 2 for the last 15 minutes.";
              }
              {
                name = "NodeHighMemory";
                expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
                for = "2m";
                description = "This node is using more than 90 % of available RAM.";
              }
              {
                name = "TP440ACPIBroken";
                # Fires when the thermal-zone sensor on nunotaba reads
                # exactly 48 for 10 minutes.
                expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
                for = "10m";
                description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending";
              }
            ];
          };
        })
      ];
  };

  # htpasswd file for the virtual host below, with group nginx (presumably
  # so the nginx worker can read it; confirm against the krops module).
  krops.secrets.prometheus-htpasswd = {
    group = "nginx";
  };

  services.nginx.virtualHosts."prometheus.sbruder.de" = {
    enableACME = true;
    forceSSL = true;
    basicAuthFile = config.krops.secrets.prometheus-htpasswd.path;
    locations = {
      "/".proxyPass = "http://${cfg.listenAddress}:${toString cfg.port}";
      "/alertmanager/".proxyPass = "http://${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}/";
    };
  };
}