2021-02-24 20:55:07 +01:00
|
|
|
|
{ config, lib, pkgs, ... }:
|
|
|
|
|
let
|
|
|
|
|
# Shorthand for this host's evaluated Prometheus option set.
cfg = config.services.prometheus;
|
|
|
|
|
|
|
|
|
|
# Wraps a list of "host:port" strings into a one-element `static_configs`
# list: [ { targets = <targets>; } ].
mkStaticTargets = targets: [{ inherit targets; }];
|
|
|
|
|
# Single-target convenience wrapper around mkStaticTargets.
mkStaticTarget = target: mkStaticTargets [ target ];
|
|
|
|
|
in
|
|
|
|
|
{
|
|
|
|
|
# Bind-mounts /data/prometheus onto the working directory of
# prometheus.service.
systemd.mounts = [
  {
    # What gets mounted where. A bind mount has no filesystem type of its
    # own, hence type "none" with the "bind" option.
    what = "/data/prometheus";
    where = config.systemd.services.prometheus.serviceConfig.WorkingDirectory;
    type = "none";
    options = "bind";

    # Ordering: only after data.mount, and before Prometheus starts.
    after = [ "data.mount" ];
    before = [ "prometheus.service" ];
    wantedBy = [ "local-fs.target" ];
  }
];
|
|
|
|
|
|
|
|
|
|
services.prometheus = {
|
|
|
|
|
# Core server settings.
enable = true;
# URL under which the web UI is reachable from outside.
webExternalUrl = "https://prometheus.sbruder.de";
# Bind to loopback only.
listenAddress = "127.0.0.1";
|
2021-02-24 20:55:07 +01:00
|
|
|
|
# Scraping and rule evaluation share the same interval.
globalConfig =
  let
    interval = "15s";
  in
  {
    scrape_interval = interval;
    evaluation_interval = interval;
  };
|
2021-03-28 11:04:25 +02:00
|
|
|
|
# Additional command line flags passed to the Prometheus server.
extraFlags =
  let
    # Retention period for stored samples.
    retention = "--storage.tsdb.retention.time=90d";
    # Enables the admin HTTP API.
    adminApi = "--web.enable-admin-api";
  in
  [ retention adminApi ];
|
2021-02-24 20:55:07 +01:00
|
|
|
|
|
|
|
|
|
# Points Prometheus at the Alertmanager instance configured in this module,
# using that instance's own listen address and port.
alertmanagers =
  let
    am = cfg.alertmanager;
  in
  lib.singleton {
    path_prefix = "/alertmanager/";
    static_configs = mkStaticTarget "${am.listenAddress}:${toString am.port}";
  };
|
|
|
|
|
# Alertmanager instance. All alerts are routed to a single "matrix" receiver,
# which posts to the webhook exposed by the go-neb "alertmanager_service".
alertmanager = {
  enable = true;
  listenAddress = "127.0.0.1";
  webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
  configuration = {
    global.resolve_timeout = "2m";

    route = {
      receiver = "matrix";
      group_by = [ "alertname" ];
      group_wait = "3m";
    };

    receivers = [
      {
        name = "matrix";
        webhook_configs = lib.singleton {
          # Take the webhook URL from the first go-neb service whose ID is
          # "alertmanager_service". If no such service is configured,
          # evaluation aborts with an explicit message instead of an opaque
          # list-index error.
          url = (lib.findFirst
            ({ ID, ... }: ID == "alertmanager_service")
            (throw "services.prometheus.alertmanager: no go-neb service with ID 'alertmanager_service' found in services.go-neb.config.services")
            config.services.go-neb.config.services).Config.webhook_url;
        };
      }
    ];
  };
};
|
|
|
|
|
|
|
|
|
|
# Scrape jobs. Jobs that scrape a local address use relabelInstance to
# replace the `instance` label with a fixed name.
scrapeConfigs =
  let
    # One-element relabel_configs list that sets the `instance` label to
    # the given replacement string.
    relabelInstance = replacement: lib.singleton {
      target_label = "instance";
      inherit replacement;
    };

    # First configured matrix-synapse listener; its bind address and port
    # form the scrape target.
    synapseListener = lib.elemAt config.services.matrix-synapse.listeners 0;

    dnsmasqExporter = cfg.exporters.dnsmasq;
  in
  [
    {
      job_name = "prometheus";
      static_configs = mkStaticTarget "localhost:${toString cfg.port}";
    }

    {
      job_name = "node";
      static_configs = mkStaticTargets [
        "fuuko.vpn.sbruder.de:9100"
        "nunotaba.vpn.sbruder.de:9100"
        "sayuri.vpn.sbruder.de:9100"
        "vueko.vpn.sbruder.de:9100"
      ];
    }

    {
      job_name = "aria2";
      static_configs = mkStaticTarget "127.0.0.1:9578";
      relabel_configs = relabelInstance "torrent.sbruder.de";
    }

    {
      job_name = "fritzbox";
      static_configs = mkStaticTarget "127.0.0.1:9133";
    }

    {
      job_name = "synapse";
      metrics_path = "/_synapse/metrics";
      static_configs = mkStaticTarget "${synapseListener.bind_address}:${toString synapseListener.port}";
      relabel_configs = relabelInstance "matrix.sbruder.de";
    }

    {
      job_name = "drone";
      # NOTE(review): DRONE_SERVER_PORT is used verbatim as the scrape
      # target, so it must already be a complete "host:port" string.
      # Confirm against the drone-server configuration.
      static_configs = mkStaticTarget config.systemd.services.drone-server.environment.DRONE_SERVER_PORT;
      relabel_configs = relabelInstance "ci.sbruder.de";
    }

    {
      job_name = "dnsmasq";
      static_configs = mkStaticTarget "${dnsmasqExporter.listenAddress}:${toString dnsmasqExporter.port}";
      relabel_configs = relabelInstance "fuuko.home.sbruder.de";
    }
  ];
|
|
|
|
|
|
|
|
|
|
# Alerting rules, rendered as a single YAML rule file containing one group.
rules =
  let
    # Builds one alerting rule from a compact description.
    #   name         emitted as the rule's `alert` field
    #   expr         PromQL expression, passed through unchanged
    #   for          passed through as the rule's `for` field; defaults to "1m"
    #   description  optional; when not null it becomes the `description`
    #                annotation, otherwise `annotations` is empty
    mkAlert = { name, expr, for ? "1m", description ? null }: {
      alert = name;
      inherit expr for;
      annotations = lib.optionalAttrs (description != null) { inherit description; };
    };
  in
  [
    (lib.generators.toYAML { } {
      groups = lib.singleton {
        name = "alert.rules";
        rules = map mkAlert [
          {
            name = "InstanceDown";
            # Instances on nunotaba and sayuri are excluded from this alert.
            # The dots are escaped so they only match a literal ".". The
            # backslash is doubled because PromQL double-quoted strings
            # treat a backslash as the start of an escape sequence.
            expr = ''up{instance!~"(nunotaba|sayuri)\\.vpn\\.sbruder\\.de:.*"} == 0'';
            description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.";
          }
          {
            name = "SystemdUnitFailed";
            expr = ''node_systemd_unit_state{state="failed"} == 1'';
            description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
          }
          {
            name = "NodeHighLoad";
            # 15-minute load average divided by the number of CPU series
            # per instance (one mode="system" series per CPU).
            expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
            for = "15m";
            description = "Node {{ $labels.instance }} is having a per-core load ≥ 2 for the last 15 minutes.";
          }
          {
            name = "NodeHighMemory";
            # Fraction of total memory that is not available.
            expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
            for = "2m";
            description = "Node {{ $labels.instance }} is using more than 90 % of available RAM.";
          }
          {
            name = "TP440ACPIBroken";
            # Fires when the sensor reports exactly 48 for 10 minutes.
            expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
            for = "10m";
            description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending";
          }
          {
            name = "TorrentNoPeers";
            expr = "sum by (instance) (aria2_torrent_peers) == 0";
            description = "Aria2 instance {{ $labels.instance }} has no peers. There might be a network connectivity problem";
          }
        ];
      };
    })
  ];
|
2021-03-05 19:49:44 +01:00
|
|
|
|
|
|
|
|
|
# FRITZ!Box exporter: queries the gateway at 192.168.100.1 and listens on
# loopback only.
exporters.fritzbox = {
  enable = true;
  listenAddress = "127.0.0.1";
  gatewayAddress = "192.168.100.1";
};
|
2021-02-24 20:55:07 +01:00
|
|
|
|
};
|
|
|
|
|
|
2021-03-05 19:49:44 +01:00
|
|
|
|
# The exporter prints “could not call action: authorization required” on
# every scrape; discard its standard output to get rid of that message.
systemd.services.prometheus-fritzbox-exporter.serviceConfig = {
  StandardOutput = "null";
};
|
|
|
|
|
|
2021-03-01 15:27:18 +01:00
|
|
|
|
# htpasswd secret used as the basic-auth file of the nginx virtual host.
# It is owned by the "nginx" user.
sops.secrets.prometheus-htpasswd = {
  sopsFile = ../secrets.yaml;
  owner = "nginx";
};
|
|
|
|
|
|
|
|
|
|
# TLS-only reverse proxy with basic auth in front of Prometheus ("/") and
# Alertmanager ("/alertmanager/").
services.nginx.virtualHosts."prometheus.sbruder.de" =
  let
    # Builds the upstream URL from a service's listen address and port.
    upstream = { listenAddress, port, ... }: "http://${listenAddress}:${toString port}";
  in
  {
    forceSSL = true;
    enableACME = true;

    basicAuthFile = config.sops.secrets.prometheus-htpasswd.path;

    locations."/".proxyPass = upstream cfg;
    locations."/alertmanager/".proxyPass = upstream cfg.alertmanager;
  };
|
|
|
|
|
}
|