254 lines
8.6 KiB
Nix
254 lines
8.6 KiB
Nix
{ config, lib, pkgs, ... }:
|
||
let
  # Shorthand for this module's Prometheus option tree.
  cfg = config.services.prometheus;

  # Wrap a list of target address strings into a `static_configs` value:
  # a one-element list holding an attribute set with a `targets` attribute.
  mkStaticTargets = targets: [{ inherit targets; }];

  # Same as mkStaticTargets, but for exactly one target address.
  mkStaticTarget = target: mkStaticTargets [ target ];
in
|
||
{
|
||
services.prometheus = {
|
||
enable = true;

# Bind to loopback only; the nginx virtual host for prometheus.sbruder.de
# defined in this file proxies requests to this address.
listenAddress = "127.0.0.1";
webExternalUrl = "https://prometheus.sbruder.de";

# Targets are scraped and rules are evaluated on the same 15 second cadence.
globalConfig.evaluation_interval = "15s";
globalConfig.scrape_interval = "15s";

# Additional command line flags passed to Prometheus.
extraFlags = [
  "--storage.tsdb.retention.time=90d"
  "--web.enable-admin-api"
];
|
||
|
||
# Send alerts to the Alertmanager instance configured in this file. Its
# external URL ends in /alertmanager, hence the matching path prefix.
alertmanagers = lib.singleton {
  path_prefix = "/alertmanager/";
  static_configs =
    let
      am = cfg.alertmanager;
    in
    mkStaticTarget "${am.listenAddress}:${toString am.port}";
};
|
||
# Alertmanager, bound to loopback and published under the /alertmanager path
# of the Prometheus host name.
alertmanager = {
  enable = true;
  listenAddress = "127.0.0.1";
  webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
  configuration = {
    global.resolve_timeout = "2m";

    # Every alert goes to the single "matrix" receiver, grouped by alert name.
    route = {
      receiver = "matrix";
      group_by = [ "alertname" ];
      group_wait = "3m";
    };

    receivers = [
      {
        name = "matrix";
        webhook_configs = lib.singleton {
          # Reuse the webhook URL of the go-neb service whose ID is
          # "alertmanager_service". findFirst with an explicit throw gives a
          # readable evaluation error when that service is not configured,
          # instead of an out-of-bounds list access.
          url = (lib.findFirst
            ({ ID, ... }: ID == "alertmanager_service")
            (throw "services.go-neb.config.services contains no service with ID \"alertmanager_service\"")
            config.services.go-neb.config.services).Config.webhook_url;
        };
      }
    ];
  };
};
|
||
|
||
scrapeConfigs =
  let
    # Relabel rule that overwrites the instance label with a fixed value.
    fixedInstance = value: lib.singleton {
      replacement = value;
      target_label = "instance";
    };

    # Scrape job for an exporter that listens on the same port on several
    # hosts below vpn.sbruder.de. The relabel rule matches the full target
    # address and writes the result to the instance label; with Prometheus'
    # default replacement this is the regex capture group, i.e. the host name.
    vpnJob = job_name: port: hosts: {
      inherit job_name;
      static_configs = mkStaticTargets
        (map (host: "${host}.vpn.sbruder.de:${toString port}") hosts);
      relabel_configs = lib.singleton {
        source_labels = [ "__address__" ];
        target_label = "instance";
        regex = "(.*)\\.vpn\\.sbruder\\.de:${toString port}";
      };
    };

    # The first listener configured for Synapse is the one that gets scraped.
    synapseListener = lib.elemAt config.services.matrix-synapse.settings.listeners 0;
  in
  [
    # Prometheus scraping itself.
    {
      job_name = "prometheus";
      static_configs = mkStaticTarget "localhost:${toString cfg.port}";
    }

    (vpnJob "node" 9100 [
      "fuuko"
      "mayushii"
      "nunotaba"
      "renge"
      "hitagi"
      "vueko"
      "okarin"
      "shinobu"
      "nazuna"
      "yuzuru"
    ])

    (vpnJob "qbittorrent" 9561 [
      "fuuko"
      "nazuna"
    ])

    {
      job_name = "synapse";
      metrics_path = "/_synapse/metrics";
      static_configs = mkStaticTarget
        "${lib.elemAt synapseListener.bind_addresses 0}:${toString synapseListener.port}";
      relabel_configs = fixedInstance "matrix.sbruder.de";
    }

    {
      job_name = "dnsmasq";
      # The port is taken from this machine's dnsmasq exporter option, while
      # the target host is shinobu.
      static_configs = mkStaticTarget
        "shinobu.vpn.sbruder.de:${toString cfg.exporters.dnsmasq.port}";
      relabel_configs = fixedInstance "shinobu";
    }

    {
      job_name = "hcloud";
      static_configs = mkStaticTarget config.services.hcloud_exporter.listenAddress;
    }

    {
      job_name = "co2";
      static_configs = mkStaticTarget "shinobu.vpn.sbruder.de:9672";
    }

    {
      job_name = "rspamd";
      metrics_path = "/rspamd/metrics";
      static_configs = mkStaticTarget "vueko.vpn.sbruder.de";
      relabel_configs = fixedInstance "vueko.sbruder.de";
    }

    (vpnJob "knot" 9433 [
      "okarin"
      "vueko"
    ])

    {
      job_name = "snmp";
      metrics_path = "/snmp";
      params.module = [ "if_mib" ];
      static_configs = mkStaticTargets [ "karibik.management.shinonome-lab.de" ];
      # In order: copy the configured address into __param_target, copy that
      # into the instance label, then point the scrape address at
      # shinobu.vpn.sbruder.de:9116.
      relabel_configs = [
        {
          target_label = "__param_target";
          source_labels = [ "__address__" ];
        }
        {
          target_label = "instance";
          source_labels = [ "__param_target" ];
        }
        {
          target_label = "__address__";
          replacement = "shinobu.vpn.sbruder.de:9116";
        }
      ];
    }
  ];
|
||
|
||
# Alerting rules, rendered into a single rule file with one group.
rules =
  let
    # Build one alerting rule.
    #   name        - alert name
    #   expr        - PromQL expression that triggers the alert
    #   for         - how long expr must hold before firing (default "1m")
    #   description - optional text; when null, no annotations are emitted
    mkAlert = { name, expr, for ? "1m", description ? null }: {
      alert = name;
      inherit expr for;
      annotations = lib.optionalAttrs (description != null) { inherit description; };
    };
  in
  [
    (lib.generators.toYAML { } {
      groups = lib.singleton {
        name = "alert.rules";
        rules = map mkAlert [
          {
            name = "InstanceDown";
            # The listed instances are excluded from this alert.
            expr = ''up{instance!~"(nunotaba|hitagi|mayushii|fuuko)"} == 0'';
            description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.";
          }
          {
            name = "SystemdUnitFailed";
            expr = ''node_systemd_unit_state{state="failed"} == 1'';
            description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
          }
          {
            name = "NodeHighLoad";
            expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
            for = "15m";
            description = "Node {{ $labels.instance }} is having a per-core load ≥ 2 for the last 15 minutes.";
          }
          {
            name = "NodeHighMemory";
            expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
            for = "2m";
            description = "Node {{ $labels.instance }} is using more than 90 % of available RAM.";
          }
          {
            name = "TP440ACPIBroken";
            # The "node" scrape job relabels instance to the bare host name,
            # so the selector must use "nunotaba" rather than the full
            # host:port address; otherwise it never matches any series.
            expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba",job="node",sensor="temp1"} == 48'';
            for = "10m";
            description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending";
          }
          {
            name = "TorrentNoPeers";
            expr = "sum by (instance) (qBittorrent_torrent_connected_leechs) == 0";
            description = "qBittorrent instance {{ $labels.instance }} has no peers. There might be a network connectivity problem";
          }
          # The "less than 40% free" condition is there to account for /boot
          # being full (which causes ugly errors on rebuild).
          {
            name = "DiskFull";
            expr = ''node_filesystem_free_bytes{fstype!~"ramfs|tmpfs", mountpoint!~"/nix/store"} / node_filesystem_size_bytes{fstype!~"ramfs|tmpfs", mountpoint!~"/nix/store"} < 0.4 and node_filesystem_free_bytes{fstype!~"ramfs|tmpfs", mountpoint!~"/nix/store"} < 4*1024^3'';
            # Less than 40% free is the same as more than 60% used.
            description = "Device {{ $labels.device }} on {{ $labels.instance }}:{{ $labels.mountpoint }} has less than 4GiB free space while being used over 60%";
          }
        ];
      };
    })
  ];
|
||
};
|
||
|
||
# Exporters that are not part of nixpkgs’ prometheus infrastructure.
#
# The hcloud exporter listens on loopback; the "hcloud" scrape job reads the
# same listenAddress option, so both always agree.
services.hcloud_exporter = {
  enable = true;
  # Environment file provided through the sops secret declared in this file.
  environmentFile = config.sops.secrets.hcloud_exporter-environment.path;
  listenAddress = "127.0.0.1:9501";
};
|
||
sops.secrets = {
  # Environment file consumed by the hcloud exporter.
  hcloud_exporter-environment.sopsFile = ../secrets.yaml;

  # Basic auth file for the nginx virtual host; owned by the nginx user.
  prometheus-htpasswd = {
    sopsFile = ../secrets.yaml;
    owner = "nginx";
  };
};
|
||
|
||
# TLS reverse proxy in front of Prometheus and Alertmanager, protected by
# HTTP basic auth.
services.nginx.virtualHosts."prometheus.sbruder.de" =
  let
    # Plain HTTP URL of a local backend.
    backend = address: port: "http://${address}:${toString port}";
  in
  {
    forceSSL = true;
    enableACME = true;

    basicAuthFile = config.sops.secrets.prometheus-htpasswd.path;

    locations."/".proxyPass = backend cfg.listenAddress cfg.port;
    locations."/alertmanager/".proxyPass =
      backend cfg.alertmanager.listenAddress cfg.alertmanager.port;
  };
|
||
}
|