nixos-config/machines/renge/services/prometheus.nix
Simon Bruder 444aa962b5
fuuko: Add router service
This now mostly replaces ayu, which has been sitting between upstream
and the switch the last 6 months.

It now also configures routing over Mullvad, which also finally enables
IPv6 (I don’t understand how people get away with not providing it
natively in 2023).

Once I get a WLAN card, it will also host its own access point, which
currently still relies on ayu as “dumb AP” (as OpenWRT calls it).
2023-04-02 01:14:01 +02:00

193 lines
6.4 KiB
Nix
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{ config, lib, pkgs, ... }:
let
cfg = config.services.prometheus;
mkStaticTargets = targets: lib.singleton { inherit targets; };
mkStaticTarget = target: mkStaticTargets (lib.singleton target);
in
{
services.prometheus = {
enable = true;
listenAddress = "127.0.0.1";
webExternalUrl = "https://prometheus.sbruder.de";
globalConfig = {
scrape_interval = "15s";
evaluation_interval = "15s";
};
extraFlags = [
"--storage.tsdb.retention.time=90d"
"--web.enable-admin-api"
];
alertmanagers = [
{
static_configs = mkStaticTarget "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
path_prefix = "/alertmanager/";
}
];
alertmanager = {
enable = true;
listenAddress = "127.0.0.1";
webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
configuration = {
global.resolve_timeout = "2m";
route = {
receiver = "matrix";
group_by = [ "alertname" ];
group_wait = "3m";
};
receivers = [
{
name = "matrix";
webhook_configs = lib.singleton {
url = (lib.elemAt
(lib.filter
({ ID, ... }: ID == "alertmanager_service")
config.services.go-neb.config.services)
0).Config.webhook_url;
};
}
];
};
};
scrapeConfigs = [
{
job_name = "prometheus";
static_configs = mkStaticTarget "localhost:${toString cfg.port}";
}
{
job_name = "node";
static_configs = mkStaticTargets [
"fuuko.vpn.sbruder.de:9100"
"mayushii.vpn.sbruder.de:9100"
"nunotaba.vpn.sbruder.de:9100"
"renge.vpn.sbruder.de:9100"
"hitagi.vpn.sbruder.de:9100"
"vueko.vpn.sbruder.de:9100"
];
}
{
job_name = "qbittorrent";
static_configs = mkStaticTargets [
"fuuko.vpn.sbruder.de:9561"
];
relabel_configs = lib.singleton {
target_label = "instance";
source_labels = lib.singleton "__address__";
regex = "(.*)\\.vpn\\.sbruder\\.de:9561";
};
}
(
let
listenerCfg = (lib.elemAt config.services.matrix-synapse.settings.listeners 0);
in
{
job_name = "synapse";
static_configs = mkStaticTarget "${lib.elemAt listenerCfg.bind_addresses 0}:${toString listenerCfg.port}";
metrics_path = "/_synapse/metrics";
relabel_configs = lib.singleton {
target_label = "instance";
replacement = "matrix.sbruder.de";
};
}
)
{
job_name = "dnsmasq";
static_configs = mkStaticTarget "fuuko.vpn.sbruder.de:${toString config.services.prometheus.exporters.dnsmasq.port}";
relabel_configs = lib.singleton {
target_label = "instance";
replacement = "fuuko.home.sbruder.de";
};
}
{
job_name = "hcloud";
static_configs = mkStaticTarget config.services.hcloud_exporter.listenAddress;
}
{
job_name = "co2";
static_configs = mkStaticTarget "fuuko.vpn.sbruder.de:9672";
}
];
rules =
let
mkAlert = { name, expr, for ? "1m", description ? null }: {
alert = name;
inherit expr for;
annotations = lib.optionalAttrs (description != null) { inherit description; };
};
in
[
(lib.generators.toYAML { } {
groups = lib.singleton {
name = "alert.rules";
rules = map mkAlert [
{
name = "InstanceDown";
expr = ''up{instance!~"(nunotaba|hitagi|mayushii|fuuko).vpn.sbruder.de:.*"} == 0'';
description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.";
}
{
name = "SystemdUnitFailed";
expr = ''node_systemd_unit_state{state="failed"} == 1'';
description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
}
{
name = "NodeHighLoad";
expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
for = "15m";
description = "Node {{ $labels.instance }} is having a per-core load 2 for the last 15 minutes.";
}
{
name = "NodeHighMemory";
expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
for = "2m";
description = "Node {{ $labels.instance }} is using more than 90% of available RAM.";
}
{
name = "TP440ACPIBroken";
expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
for = "10m";
description = "Thinkpad T440s ACPI temperature is broken. Its reported temperature is 48°C for the last 10 minutes. That doesnt seem right. Try suspending";
}
{
name = "TorrentNoPeers";
expr = "sum by (instance) (qBittorrent_torrent_connected_leechs) == 0";
description = "qBittorrent instance {{ $labels.instance }} has no peers. There might be a network connectivity problem";
}
];
};
})
];
};
# exporters that are not part of nixpkgs prometheus infrastructure
services.hcloud_exporter = {
enable = true;
listenAddress = "127.0.0.1:9501";
environmentFile = config.sops.secrets.hcloud_exporter-environment.path;
};
sops.secrets.hcloud_exporter-environment.sopsFile = ../secrets.yaml;
sops.secrets.prometheus-htpasswd = {
owner = "nginx";
sopsFile = ../secrets.yaml;
};
services.nginx.virtualHosts."prometheus.sbruder.de" = {
enableACME = true;
forceSSL = true;
basicAuthFile = config.sops.secrets.prometheus-htpasswd.path;
locations = {
"/".proxyPass = "http://${cfg.listenAddress}:${toString cfg.port}";
"/alertmanager/".proxyPass = "http://${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
};
};
}