Simon Bruder
ebfa0ec16a
Sadly, they are so interconnected that it is not easily possible to migrate them in smaller steps. They should be refactored to be more modular and independent of each other.
192 lines
6.2 KiB
Nix
192 lines
6.2 KiB
Nix
{ config, lib, pkgs, ... }:
|
||
let
  # Shorthand for this module's Prometheus option tree.
  cfg = config.services.prometheus;

  # Wrap a list of "host:port" strings into a one-element `static_configs`
  # list: [ { targets = [ … ]; } ].
  mkStaticTargets = targets: [{ inherit targets; }];

  # Same as mkStaticTargets, but for exactly one "host:port" string.
  mkStaticTarget = target: mkStaticTargets [ target ];
in
|
||
{
|
||
services.prometheus = {
|
||
# Prometheus itself listens on loopback only; webExternalUrl is the URL it
# advertises for itself.
enable = true;
listenAddress = "127.0.0.1";
webExternalUrl = "https://prometheus.sbruder.de";

# Scrape targets and evaluate rules every 15 seconds.
globalConfig.scrape_interval = "15s";
globalConfig.evaluation_interval = "15s";

# Extra command line flags: 90 days of TSDB retention and the admin API.
extraFlags = [ "--storage.tsdb.retention.time=90d" "--web.enable-admin-api" ];
|
||
|
||
# Where Prometheus delivers alerts: the Alertmanager configured in this module,
# reached under the /alertmanager/ path prefix.
alertmanagers =
  let
    alertmanagerAddress = "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
  in
  lib.singleton {
    path_prefix = "/alertmanager/";
    static_configs = mkStaticTarget alertmanagerAddress;
  };
|
||
# Alertmanager instance; listens on loopback and advertises the
# /alertmanager sub-path of the Prometheus host as its external URL.
alertmanager = {
  enable = true;
  listenAddress = "127.0.0.1";
  webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
  configuration =
    let
      # The go-neb service entry whose webhook receives the alerts.
      # findFirst returns the first match, exactly like taking element 0 of the
      # filtered list, but a missing entry now fails evaluation with a message
      # that names the problem instead of "list index 0 is out of bounds".
      nebAlertmanagerService = lib.findFirst
        ({ ID, ... }: ID == "alertmanager_service")
        (throw "services.go-neb.config.services has no entry with ID \"alertmanager_service\", which the Alertmanager \"matrix\" receiver requires")
        config.services.go-neb.config.services;
    in
    {
      global.resolve_timeout = "2m";

      # Everything goes to the single "matrix" receiver, grouped by alert name.
      route = {
        receiver = "matrix";
        group_by = [ "alertname" ];
        group_wait = "3m";
      };

      receivers = [
        {
          name = "matrix";
          webhook_configs = lib.singleton {
            url = nebAlertmanagerService.Config.webhook_url;
          };
        }
      ];
    };
};
|
||
|
||
# One entry per scrape job.
scrapeConfigs =
  let
    # "<host>.vpn.sbruder.de:<port>" for a short host name.
    vpnTarget = port: host: "${host}.vpn.sbruder.de:${toString port}";

    # relabel_configs that set the `instance` label to a fixed value.
    fixedInstance = replacement: lib.singleton {
      target_label = "instance";
      inherit replacement;
    };

    # The synapse job scrapes the first configured synapse listener.
    synapseListener = lib.elemAt config.services.matrix-synapse.listeners 0;
  in
  [
    # Prometheus scraping itself.
    {
      job_name = "prometheus";
      static_configs = mkStaticTarget "localhost:${toString cfg.port}";
    }

    {
      job_name = "node";
      static_configs = mkStaticTargets (map (vpnTarget 9100) [
        "fuuko"
        "mayushii"
        "okarin"
        "renge"
        "sayuri"
        "vueko"
        "yuzuru"
      ]);
    }

    {
      job_name = "qbittorrent";
      static_configs = mkStaticTargets (map (vpnTarget 9561) [
        "fuuko"
        "okarin"
      ]);
      # Rewrites `instance` from the address using the regex's capture group
      # (no explicit replacement is given).
      relabel_configs = lib.singleton {
        source_labels = [ "__address__" ];
        regex = "(.*)\\.vpn\\.sbruder\\.de:9561";
        target_label = "instance";
      };
    }

    {
      job_name = "fritzbox";
      static_configs = mkStaticTarget (vpnTarget 9133 "fuuko");
    }

    {
      job_name = "synapse";
      metrics_path = "/_synapse/metrics";
      static_configs = mkStaticTarget "${synapseListener.bind_address}:${toString synapseListener.port}";
      relabel_configs = fixedInstance "matrix.sbruder.de";
    }

    {
      job_name = "dnsmasq";
      static_configs = mkStaticTarget (vpnTarget cfg.exporters.dnsmasq.port "fuuko");
      relabel_configs = fixedInstance "fuuko.home.sbruder.de";
    }

    {
      job_name = "hcloud";
      static_configs = mkStaticTarget config.services.hcloud_exporter.listenAddress;
    }
  ];
|
||
|
||
# Alerting rules, rendered into a single rule file with one group.
rules =
  let
    # Build one alerting rule.
    #   name        - alert name
    #   expr        - PromQL expression that fires the alert
    #   for         - how long expr must hold before firing (default "1m")
    #   description - optional annotation text; omitted when null
    mkAlert = { name, expr, for ? "1m", description ? null }: {
      alert = name;
      inherit expr for;
      annotations = lib.optionalAttrs (description != null) { inherit description; };
    };
  in
  [
    (lib.generators.toYAML { } {
      groups = lib.singleton {
        name = "alert.rules";
        rules = map mkAlert [
          {
            name = "InstanceDown";
            # sayuri and mayushii are excluded from this alert. The dots are
            # escaped so they only match literal dots; the backslash is doubled
            # because the regex sits inside a double-quoted PromQL string.
            expr = ''up{instance!~"(sayuri|mayushii)\\.vpn\\.sbruder\\.de:.*"} == 0'';
            description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.";
          }
          {
            name = "SystemdUnitFailed";
            expr = ''node_systemd_unit_state{state="failed"} == 1'';
            description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
          }
          {
            name = "NodeHighLoad";
            # 15-minute load divided by the number of mode="system" CPU series
            # per instance.
            expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
            for = "15m";
            description = "Node {{ $labels.instance }} is having a per-core load > 2 for the last 15 minutes.";
          }
          {
            name = "NodeHighMemory";
            expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
            for = "2m";
            description = "Node {{ $labels.instance }} is using more than 90 % of available RAM.";
          }
          {
            name = "TorrentNoPeers";
            expr = "sum by (instance) (qBittorrent_torrent_connected_leechs) == 0";
            description = "qBittorrent instance {{ $labels.instance }} has no peers. There might be a network connectivity problem.";
          }
        ];
      };
    })
  ];
|
||
};
|
||
|
||
# The fritzbox exporter emits “could not call action: authorization required”
# on every scrape; discard its standard output to get rid of that.
systemd.services.prometheus-fritzbox-exporter = {
  serviceConfig.StandardOutput = "null";
};
|
||
|
||
# Exporters that are not part of nixpkgs’ prometheus infrastructure.
# hcloud_exporter listens on loopback and reads its environment from a
# sops-managed secret.
services.hcloud_exporter.enable = true;
services.hcloud_exporter.listenAddress = "127.0.0.1:9501";
services.hcloud_exporter.environmentFile = config.sops.secrets.hcloud_exporter-environment.path;

sops.secrets.hcloud_exporter-environment = {
  sopsFile = ../secrets.yaml;
};
|
||
|
||
# htpasswd file used by the nginx virtual host for basic auth; owned by the
# "nginx" user.
sops.secrets.prometheus-htpasswd.sopsFile = ../secrets.yaml;
sops.secrets.prometheus-htpasswd.owner = "nginx";
|
||
|
||
# TLS-only reverse proxy with basic auth: "/" goes to Prometheus and
# "/alertmanager/" goes to Alertmanager.
services.nginx.virtualHosts."prometheus.sbruder.de" =
  let
    # "http://<address>:<port>" for a local upstream.
    upstream = address: port: "http://${address}:${toString port}";
  in
  {
    forceSSL = true;
    enableACME = true;

    basicAuthFile = config.sops.secrets.prometheus-htpasswd.path;

    locations."/".proxyPass = upstream cfg.listenAddress cfg.port;
    locations."/alertmanager/".proxyPass = upstream cfg.alertmanager.listenAddress cfg.alertmanager.port;
  };
|
||
}
|