# SPDX-FileCopyrightText: 2021-2024 Simon Bruder
#
# SPDX-License-Identifier: AGPL-3.0-or-later

# NixOS module that configures Prometheus and Alertmanager on this host,
# declares all scrape jobs and alerting rules, enables the hcloud exporter and
# exposes both web UIs through an nginx virtual host protected by basic auth.
{ config, lib, pkgs, ... }:
let
  cfg = config.services.prometheus;

  # Wrap a list of "host:port" strings into a one-element `static_configs` list.
  mkStaticTargets = targets: lib.singleton { inherit targets; };
  # Same as mkStaticTargets, for a single target.
  mkStaticTarget = target: mkStaticTargets (lib.singleton target);

  # Build a scrape job for an exporter that listens on the same port on several
  # hosts reachable as <host>.vpn.sbruder.de.
  #
  #   job_name: name of the Prometheus job
  #   port:     exporter port (integer)
  #   hosts:    short host names, in the order they should appear as targets
  #
  # The relabel rule captures the part of __address__ in front of
  # ".vpn.sbruder.de:<port>" and writes it to `instance`, so the resulting
  # `instance` label is just the short host name (e.g. "nunotaba").
  # Alert expressions must therefore match on the short name.
  mkVpnJob = { job_name, port, hosts }: {
    inherit job_name;
    static_configs = mkStaticTargets
      (map (host: "${host}.vpn.sbruder.de:${toString port}") hosts);
    relabel_configs = lib.singleton {
      target_label = "instance";
      source_labels = lib.singleton "__address__";
      regex = "(.*)\\.vpn\\.sbruder\\.de:${toString port}";
    };
  };
in
{
  services.prometheus = {
    enable = true;
    listenAddress = "127.0.0.1";
    webExternalUrl = "https://prometheus.sbruder.de";
    globalConfig = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };
    extraFlags = [
      "--storage.tsdb.retention.time=90d"
      "--web.enable-admin-api"
    ];

    # Send alerts to the Alertmanager instance configured below.
    alertmanagers = [
      {
        static_configs = mkStaticTarget "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
        path_prefix = "/alertmanager/";
      }
    ];

    alertmanager = {
      enable = true;
      listenAddress = "127.0.0.1";
      webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
      configuration = {
        global.resolve_timeout = "2m";
        route = {
          receiver = "matrix";
          group_by = [ "alertname" ];
          group_wait = "3m";
        };
        receivers = [
          {
            name = "matrix";
            webhook_configs = lib.singleton {
              # The webhook URL is taken from the go-neb service entry whose ID
              # is "alertmanager_service"; evaluation fails if no such entry
              # exists (lib.elemAt on an empty list).
              url = (lib.elemAt
                (lib.filter
                  ({ ID, ... }: ID == "alertmanager_service")
                  config.services.go-neb.config.services)
                0).Config.webhook_url;
            };
          }
        ];
      };
    };

    scrapeConfigs = [
      {
        job_name = "prometheus";
        static_configs = mkStaticTarget "localhost:${toString cfg.port}";
      }
      (mkVpnJob {
        job_name = "node";
        port = 9100;
        hosts = [
          "fuuko"
          "mayushii"
          "nunotaba"
          "renge"
          "hitagi"
          "vueko"
          "okarin"
          "shinobu"
          "nazuna"
          "yuzuru"
          "koyomi"
        ];
      })
      (mkVpnJob {
        job_name = "smartctl";
        port = 9633;
        hosts = [
          "fuuko"
          "mayushii"
          "nunotaba"
          "hitagi"
          "shinobu"
          "koyomi"
        ];
      })
      (mkVpnJob {
        job_name = "qbittorrent";
        port = 9561;
        hosts = [
          "fuuko"
          "nazuna"
        ];
      })
      (
        let
          # Scrape the first bind address of the first configured synapse listener.
          listenerCfg = (lib.elemAt config.services.matrix-synapse.settings.listeners 0);
        in
        {
          job_name = "synapse";
          static_configs = mkStaticTarget "${lib.elemAt listenerCfg.bind_addresses 0}:${toString listenerCfg.port}";
          metrics_path = "/_synapse/metrics";
          relabel_configs = lib.singleton {
            target_label = "instance";
            replacement = "matrix.sbruder.de";
          };
        }
      )
      {
        job_name = "dnsmasq";
        static_configs = mkStaticTarget "shinobu.vpn.sbruder.de:${toString cfg.exporters.dnsmasq.port}";
        relabel_configs = lib.singleton {
          target_label = "instance";
          replacement = "shinobu";
        };
      }
      {
        job_name = "hcloud";
        static_configs = mkStaticTarget config.services.hcloud_exporter.listenAddress;
      }
      {
        job_name = "co2";
        static_configs = mkStaticTarget "shinobu.vpn.sbruder.de:9672";
      }
      {
        job_name = "rspamd";
        static_configs = mkStaticTarget "vueko.vpn.sbruder.de";
        metrics_path = "/rspamd/metrics";
        relabel_configs = lib.singleton {
          target_label = "instance";
          replacement = "vueko.sbruder.de";
        };
      }
      (mkVpnJob {
        job_name = "knot";
        port = 9433;
        hosts = [
          "vueko"
          "renge"
          "okarin"
          "yuzuru"
        ];
      })
      {
        job_name = "snmp";
        metrics_path = "/snmp";
        params = {
          module = [ "if_mib" ];
        };
        static_configs = mkStaticTargets [
          "karibik.management.shinonome-lab.de"
        ];
        # The listed target is passed as the `target` URL parameter and kept as
        # the `instance` label; the actual scrape goes to the address set in
        # the last rule.
        relabel_configs = [
          {
            source_labels = lib.singleton "__address__";
            target_label = "__param_target";
          }
          {
            source_labels = lib.singleton "__param_target";
            target_label = "instance";
          }
          {
            target_label = "__address__";
            replacement = "shinobu.vpn.sbruder.de:9116";
          }
        ];
      }
      (mkVpnJob {
        job_name = "haproxy";
        port = 8404;
        hosts = [
          "koyomi"
        ];
      })
    ];

    rules =
      let
        # Build one alerting rule.
        #
        #   name:        alert name
        #   expr:        PromQL expression
        #   for:         how long expr must hold before firing (default "1m")
        #   description: optional annotation; omitted entirely when null
        mkAlert = { name, expr, for ? "1m", description ? null }: {
          alert = name;
          inherit expr for;
          annotations = lib.optionalAttrs (description != null) { inherit description; };
        };
      in
      [
        (lib.generators.toYAML { } {
          groups = lib.singleton {
            name = "alert.rules";
            rules = map mkAlert [
              {
                name = "InstanceDown";
                # The four excluded instances never trigger this alert.
                expr = ''up{instance!~"(nunotaba|hitagi|mayushii|fuuko)"} == 0'';
                description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.";
              }
              {
                name = "SystemdUnitFailed";
                expr = ''node_systemd_unit_state{state="failed"} == 1'';
                description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
              }
              {
                name = "NodeHighLoad";
                expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
                for = "15m";
                description = "Node {{ $labels.instance }} is having a per-core load ≥ 2 for the last 15 minutes.";
              }
              {
                name = "NodeHighMemory";
                expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
                for = "2m";
                description = "Node {{ $labels.instance }} is using more than 90 % of available RAM.";
              }
              {
                name = "TP440ACPIBroken";
                # The node job relabels `instance` to the short host name, so
                # the selector has to use "nunotaba"; matching on the full
                # "host:port" address would never select any series.
                expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba",job="node",sensor="temp1"} == 48'';
                for = "10m";
                description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending";
              }
              {
                name = "TorrentNoPeers";
                expr = "sum by (instance) (qBittorrent_torrent_connected_leechs) == 0";
                description = "qBittorrent instance {{ $labels.instance }} has no peers. There might be a network connectivity problem";
              }
              # Fires only when a filesystem has both less than 40 % free and
              # less than 4 GiB free. The relative (<40 %) condition is there
              # to account for /boot being full (which causes ugly errors on
              # rebuild).
              {
                name = "DiskFull";
                expr = ''node_filesystem_free_bytes{fstype!~"ramfs|tmpfs", mountpoint!~"/nix/store"} / node_filesystem_size_bytes{fstype!~"ramfs|tmpfs", mountpoint!~"/nix/store"} < 0.4 and node_filesystem_free_bytes{fstype!~"ramfs|tmpfs", mountpoint!~"/nix/store"} < 4*1024^3'';
                description = "Device {{ $labels.device }} on {{ $labels.instance }}:{{ $labels.mountpoint }} has less than 4GiB free space while being used over 60%";
              }
            ];
          };
        })
      ];
  };

  # exporters that are not part of nixpkgs’ prometheus infrastructure
  services.hcloud_exporter = {
    enable = true;
    listenAddress = "127.0.0.1:9501";
    environmentFile = config.sops.secrets.hcloud_exporter-environment.path;
  };
  sops.secrets.hcloud_exporter-environment.sopsFile = ../secrets.yaml;

  # htpasswd file read by nginx for the basic auth on the virtual host below.
  sops.secrets.prometheus-htpasswd = {
    owner = "nginx";
    sopsFile = ../secrets.yaml;
  };

  # Reverse proxy for the Prometheus and Alertmanager web interfaces.
  services.nginx.virtualHosts."prometheus.sbruder.de" = {
    enableACME = true;
    forceSSL = true;

    basicAuthFile = config.sops.secrets.prometheus-htpasswd.path;

    locations = {
      "/".proxyPass = "http://${cfg.listenAddress}:${toString cfg.port}";

      "/alertmanager/".proxyPass = "http://${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
    };
  };
}