From f388995ef6be2c64440019aead973eb5382df9aa Mon Sep 17 00:00:00 2001
From: Simon Bruder
Date: Wed, 24 Feb 2021 20:55:07 +0100
Subject: [PATCH] fuuko: Add prometheus

---
 machines/fuuko/configuration.nix       |   3 +
 machines/fuuko/services/prometheus.nix | 148 +++++++++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 machines/fuuko/services/prometheus.nix

diff --git a/machines/fuuko/configuration.nix b/machines/fuuko/configuration.nix
index 1a329b2..1c7cf10 100644
--- a/machines/fuuko/configuration.nix
+++ b/machines/fuuko/configuration.nix
@@ -4,6 +4,8 @@
     ./hardware-configuration.nix
     ../../modules
     ../../users/simon
+
+    ./services/prometheus.nix
   ];
 
   sbruder = {
@@ -25,6 +27,7 @@
     };
   };
   networking.firewall.allowedTCPPorts = [ 80 443 ];
+  users.users.nginx.extraGroups = [ "keys" ];
 
   networking.hostName = "fuuko";
 
diff --git a/machines/fuuko/services/prometheus.nix b/machines/fuuko/services/prometheus.nix
new file mode 100644
index 0000000..e664add
--- /dev/null
+++ b/machines/fuuko/services/prometheus.nix
@@ -0,0 +1,148 @@
+# Prometheus and Alertmanager for fuuko, served through nginx with basic auth.
+{ config, lib, pkgs, ... }:
+let
+  cfg = config.services.prometheus;
+
+  # Wrap a list of "host:port" targets into a one-element static_configs list.
+  mkStaticTargets = targets: lib.singleton { inherit targets; };
+  # Same as mkStaticTargets, but for a single target.
+  mkStaticTarget = target: mkStaticTargets (lib.singleton target);
+in
+{
+  # Bind-mount /data/prometheus onto the Prometheus working directory, ordered
+  # after data.mount and before prometheus.service.
+  systemd.mounts = lib.singleton {
+    after = [ "data.mount" ];
+    before = [ "prometheus.service" ];
+    wantedBy = [ "local-fs.target" ];
+    what = "/data/prometheus";
+    where = config.systemd.services.prometheus.serviceConfig.WorkingDirectory;
+    type = "none";
+    options = "bind";
+  };
+
+  services.prometheus = {
+    enable = true;
+    # Loopback only; the nginx virtual host below proxies to this address.
+    listenAddress = "127.0.0.1";
+    globalConfig = {
+      scrape_interval = "15s";
+      evaluation_interval = "15s";
+    };
+
+    # Deliver alerts to the Alertmanager instance configured right below.
+    alertmanagers = [
+      { static_configs = mkStaticTarget "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}"; }
+    ];
+    alertmanager = {
+      enable = true;
+      listenAddress = "127.0.0.1";
+      configuration = {
+        global.resolve_timeout = "2m";
+
+        route = {
+          receiver = "matrix";
+          group_by = [ "alertname" ];
+          group_wait = "3m";
+        };
+
+        receivers = [
+          {
+            name = "matrix";
+            webhook_configs = [
+              # FIXME: Add correct URL
+              { url = "http://matrix-alertmanager:3000/alerts"; }
+            ];
+          }
+        ];
+      };
+    };
+
+    scrapeConfigs = [
+      {
+        job_name = "prometheus";
+        static_configs = mkStaticTarget "localhost:${toString cfg.port}";
+      }
+      {
+        job_name = "node";
+        static_configs = mkStaticTargets [
+          "fuuko.vpn.sbruder.de:9100"
+          "issei.vpn.sbruder.de:9100"
+          "nunotaba.vpn.sbruder.de:9100"
+          "sayuri.vpn.sbruder.de:9100"
+          "vueko.vpn.sbruder.de:9100"
+        ];
+      }
+    ];
+
+    rules =
+      let
+        # Build one alerting rule; `for` defaults to "1m" and the description
+        # annotation is only emitted when a description is given.
+        mkAlert = { name, expr, for ? "1m", description ? null }: {
+          alert = name;
+          inherit expr for;
+          annotations = lib.optionalAttrs (description != null) { inherit description; };
+        };
+      in
+      [
+        (lib.generators.toYAML { } {
+          groups = lib.singleton {
+            name = "alert.rules";
+            rules = map mkAlert [
+              {
+                name = "InstanceDown";
+                # nunotaba and sayuri are excluded; the dots are escaped so that
+                # they only match literal dots in the instance label.
+                expr = ''up{instance!~"(nunotaba|sayuri)\\.vpn\\.sbruder\\.de:.*"} == 0'';
+                description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.";
+              }
+              {
+                name = "SystemdUnitFailed";
+                expr = ''node_systemd_unit_state{state="failed"} == 1'';
+                description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
+              }
+              {
+                name = "NodeHighLoad";
+                expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
+                for = "15m";
+                description = "This node is having a per-core load ≥ 2 for the last 15 minutes.";
+              }
+              {
+                name = "NodeHighMemory";
+                expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
+                for = "2m";
+                description = "This node is using more than 90 % of available RAM.";
+              }
+              {
+                name = "TP440ACPIBroken";
+                expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
+                for = "10m";
+                description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending.";
+              }
+            ];
+          };
+        })
+      ];
+  };
+
+  # htpasswd file used for basic auth on the virtual host below.
+  krops.secrets.prometheus-htpasswd = {
+    group = "nginx";
+  };
+
+  services.nginx.virtualHosts."prometheus.sbruder.de" = {
+    enableACME = true;
+    forceSSL = true;
+
+    basicAuthFile = config.krops.secrets.prometheus-htpasswd.path;
+
+    locations = {
+      "/".proxyPass = "http://${cfg.listenAddress}:${toString cfg.port}";
+
+      # The URI part ("/") on the upstream makes nginx replace the
+      # /alertmanager/ prefix before passing the request on.
+      "/alertmanager/".proxyPass = "http://${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}/";
+    };
+  };
+}