# NixOS module: Prometheus with Alertmanager, the fritzbox exporter and an
# nginx virtual host that proxies to both web interfaces.
{ config, lib, pkgs, ... }:
let
  cfg = config.services.prometheus;
  amCfg = cfg.alertmanager;

  # "host:port" pairs as configured for the two daemons.
  prometheusAddress = "${cfg.listenAddress}:${toString cfg.port}";
  alertmanagerAddress = "${amCfg.listenAddress}:${toString amCfg.port}";

  # Build a `static_configs` value (a one-element list) from a list of targets.
  staticTargets = targets: [ { inherit targets; } ];
  # Same as above, for exactly one target.
  staticTarget = target: staticTargets [ target ];

  # Build a `relabel_configs` value that overwrites the `instance` label.
  pinInstance = replacement: [
    {
      target_label = "instance";
      inherit replacement;
    }
  ];

  # Webhook URL of the first go-neb service whose ID is "alertmanager_service".
  matrixWebhookUrl =
    let
      isAlertmanagerService = service: service.ID == "alertmanager_service";
      candidates = lib.filter isAlertmanagerService config.services.go-neb.config.services;
    in
    (lib.head candidates).Config.webhook_url;

  # The synapse scrape target is derived from the first configured listener.
  synapseListener = lib.head config.services.matrix-synapse.listeners;

  # Turn a short alert description into a Prometheus alerting rule.
  # `for` defaults to "1m"; `annotations` stays empty without a description.
  toAlertRule =
    { name
    , expr
    , for ? "1m"
    , description ? null
    }:
    {
      alert = name;
      inherit expr for;
      annotations = if description == null then { } else { inherit description; };
    };

  alertDefinitions = [
    {
      name = "InstanceDown";
      # Targets on nunotaba and sayuri are excluded by the matcher.
      expr = ''up{instance!~"(nunotaba|sayuri).vpn.sbruder.de:.*"} == 0'';
      description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.";
    }
    {
      name = "SystemdUnitFailed";
      expr = ''node_systemd_unit_state{state="failed"} == 1'';
      description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
    }
    {
      name = "NodeHighLoad";
      # 15-minute load divided by the number of per-CPU "system" series.
      expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
      for = "15m";
      description = "This node is having a per-core load ≥ 2 for the last 15 minutes.";
    }
    {
      name = "NodeHighMemory";
      expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
      for = "2m";
      description = "This node is using more than 90 % of available RAM.";
    }
    {
      name = "TP440ACPIBroken";
      expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
      for = "10m";
      description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending";
    }
    {
      name = "TorrentNoPeers";
      expr = "sum by (instance) (aria2_torrent_peers) == 0";
      description = "Aria2 instance {{ $labels.instance }} has no peers. There might be a network connectivity problem";
    }
  ];
in
{
  # Bind-mount /data/prometheus onto the working directory of the prometheus
  # service; ordered after data.mount and before prometheus.service.
  systemd.mounts = [
    {
      after = [ "data.mount" ];
      before = [ "prometheus.service" ];
      wantedBy = [ "local-fs.target" ];
      what = "/data/prometheus";
      where = config.systemd.services.prometheus.serviceConfig.WorkingDirectory;
      type = "none";
      options = "bind";
    }
  ];

  services.prometheus = {
    enable = true;
    listenAddress = "127.0.0.1";
    webExternalUrl = "https://prometheus.sbruder.de";

    globalConfig = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };

    extraFlags = [
      "--storage.tsdb.retention.time=90d"
      "--web.enable-admin-api"
    ];

    # Alertmanager instance that Prometheus sends its alerts to.
    alertmanagers = [
      {
        static_configs = staticTarget alertmanagerAddress;
        path_prefix = "/alertmanager/";
      }
    ];

    alertmanager = {
      enable = true;
      listenAddress = "127.0.0.1";
      webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
      configuration = {
        global.resolve_timeout = "2m";
        route = {
          receiver = "matrix";
          group_by = [ "alertname" ];
          group_wait = "3m";
        };
        receivers = [
          {
            name = "matrix";
            webhook_configs = [ { url = matrixWebhookUrl; } ];
          }
        ];
      };
    };

    scrapeConfigs = [
      {
        job_name = "prometheus";
        static_configs = staticTarget "localhost:${toString cfg.port}";
      }
      {
        job_name = "node";
        static_configs = staticTargets [
          "fuuko.vpn.sbruder.de:9100"
          "nunotaba.vpn.sbruder.de:9100"
          "sayuri.vpn.sbruder.de:9100"
          "vueko.vpn.sbruder.de:9100"
        ];
      }
      {
        job_name = "aria2";
        static_configs = staticTarget "127.0.0.1:9578";
        relabel_configs = pinInstance "torrent.sbruder.de";
      }
      {
        job_name = "fritzbox";
        static_configs = staticTarget "127.0.0.1:9133";
      }
      {
        job_name = "synapse";
        static_configs = staticTarget "${synapseListener.bind_address}:${toString synapseListener.port}";
        metrics_path = "/_synapse/metrics";
        relabel_configs = pinInstance "matrix.sbruder.de";
      }
      {
        job_name = "drone";
        # The target is taken verbatim from drone-server's environment.
        static_configs = staticTarget config.systemd.services.drone-server.environment.DRONE_SERVER_PORT;
        relabel_configs = pinInstance "ci.sbruder.de";
      }
    ];

    # A single rule file, rendered with lib.generators.toYAML.
    rules = [
      (lib.generators.toYAML { } {
        groups = [
          {
            name = "alert.rules";
            rules = map toAlertRule alertDefinitions;
          }
        ];
      })
    ];

    exporters.fritzbox = {
      enable = true;
      gatewayAddress = "192.168.100.1";
      listenAddress = "127.0.0.1";
    };
  };

  # Discard the exporter's stdout to get rid of the
  # “could not call action: authorization required” message on every scrape.
  systemd.services.prometheus-fritzbox-exporter.serviceConfig.StandardOutput = "null";

  # htpasswd file for the virtual host below; its group is set to nginx.
  krops.secrets.prometheus-htpasswd.group = "nginx";

  services.nginx.virtualHosts."prometheus.sbruder.de" = {
    enableACME = true;
    forceSSL = true;
    basicAuthFile = config.krops.secrets.prometheus-htpasswd.path;
    locations = {
      "/".proxyPass = "http://${prometheusAddress}";
      "/alertmanager/".proxyPass = "http://${alertmanagerAddress}";
    };
  };
}