# nixos-config/machines/renge/services/prometheus.nix
{ config, lib, pkgs, ... }:
let
  # Shorthand for this host's Prometheus option tree.
  cfg = config.services.prometheus;

  # Wrap a list of "host:port" strings into a one-element `static_configs` list.
  mkStaticTargets = targets: [{ inherit targets; }];

  # Same as mkStaticTargets, for a single "host:port" string.
  mkStaticTarget = target: mkStaticTargets [ target ];
in
{
services.prometheus = {
# Core server settings. Prometheus only listens on loopback; external access
# goes through the nginx virtual host defined further down in this file.
enable = true;
listenAddress = "127.0.0.1";
webExternalUrl = "https://prometheus.sbruder.de";
globalConfig = {
  scrape_interval = "15s";
  evaluation_interval = "15s";
};
# Keep 90 days of samples. The module option replaces the hand-written
# "--storage.tsdb.retention.time=90d" flag.
retentionTime = "90d";
extraFlags = [
  # NOTE(review): the admin API permits deleting series and taking snapshots;
  # it is only as protected as the reverse proxy in front of it.
  "--web.enable-admin-api"
];
# Deliver alerts to the Alertmanager instance configured in this module.
alertmanagers = [
  {
    static_configs = mkStaticTarget "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
    # Alertmanager's webExternalUrl ends in /alertmanager, so its API lives
    # under that prefix rather than at the root.
    path_prefix = "/alertmanager/";
  }
];
# Alertmanager: loopback only, published under /alertmanager, with a single
# webhook receiver named "matrix".
alertmanager = {
  enable = true;
  listenAddress = "127.0.0.1";
  webExternalUrl = "https://prometheus.sbruder.de/alertmanager";
  configuration = {
    global.resolve_timeout = "2m";
    route = {
      receiver = "matrix";
      group_by = [ "alertname" ];
      group_wait = "3m";
    };
    receivers = [
      {
        name = "matrix";
        webhook_configs = lib.singleton {
          # Reuse the webhook URL of the go-neb service whose ID is
          # "alertmanager_service". Fail evaluation with a descriptive
          # message (instead of an index-out-of-bounds error) if that
          # service is not configured.
          url = (lib.findFirst
            ({ ID, ... }: ID == "alertmanager_service")
            (throw "prometheus: no go-neb service with ID \"alertmanager_service\" is configured")
            config.services.go-neb.config.services).Config.webhook_url;
        };
      }
    ];
  };
};
# Scrape jobs. Targets are "host:port" strings wrapped by the mkStaticTarget(s)
# helpers from the surrounding let block.
scrapeConfigs = [
  {
    # Prometheus scraping itself.
    job_name = "prometheus";
    static_configs = mkStaticTarget "localhost:${toString cfg.port}";
  }
  {
    # Port 9100 on each host reachable via the VPN domain.
    job_name = "node";
    static_configs = mkStaticTargets [
      "fuuko.vpn.sbruder.de:9100"
      "mayushii.vpn.sbruder.de:9100"
      "nunotaba.vpn.sbruder.de:9100"
      "renge.vpn.sbruder.de:9100"
      "hitagi.vpn.sbruder.de:9100"
      "vueko.vpn.sbruder.de:9100"
    ];
  }
  {
    job_name = "qbittorrent";
    static_configs = mkStaticTargets [
      "fuuko.vpn.sbruder.de:9561"
    ];
    # Rewrite the instance label from the scrape address: the regex captures
    # the part before ".vpn.sbruder.de:9561". No `replacement` is set, so this
    # relies on Prometheus' default replacement (the first capture group).
    relabel_configs = lib.singleton {
      target_label = "instance";
      source_labels = lib.singleton "__address__";
      regex = "(.*)\\.vpn\\.sbruder\\.de:9561";
    };
  }
  (
    let
      # The first configured Synapse listener is used as the metrics endpoint.
      # NOTE(review): assumes that listener actually serves metrics - confirm.
      listenerCfg = lib.head config.services.matrix-synapse.settings.listeners;
    in
    {
      job_name = "synapse";
      static_configs = mkStaticTarget "${lib.head listenerCfg.bind_addresses}:${toString listenerCfg.port}";
      metrics_path = "/_synapse/metrics";
      # Unconditionally label the instance with the public server name
      # instead of the local bind address.
      relabel_configs = lib.singleton {
        target_label = "instance";
        replacement = "matrix.sbruder.de";
      };
    }
  )
  {
    # The listen address is defined by services.hcloud_exporter in this file.
    job_name = "hcloud";
    static_configs = mkStaticTarget config.services.hcloud_exporter.listenAddress;
  }
  {
    job_name = "co2";
    static_configs = mkStaticTarget "fuuko.vpn.sbruder.de:9672";
  }
];
# Alerting rules, rendered into a single rule-file string.
rules =
  let
    # Build one Prometheus alerting rule.
    #   name        - alert name
    #   expr        - PromQL expression that triggers the alert
    #   for         - how long expr must hold before firing (default "1m")
    #   description - optional text; the `description` annotation is only
    #                 emitted when this is non-null
    mkAlert = { name, expr, for ? "1m", description ? null }: {
      alert = name;
      inherit expr for;
      annotations = lib.optionalAttrs (description != null) { inherit description; };
    };
  in
  [
    (lib.generators.toYAML { } {
      groups = lib.singleton {
        name = "alert.rules";
        rules = map mkAlert [
          {
            name = "InstanceDown";
            # The listed hosts are excluded from this alert. Dots are escaped
            # so they match literally; the doubled backslash is for the PromQL
            # string literal (Nix '' strings pass backslashes through as-is).
            # NOTE(review): the qbittorrent scrape job rewrites `instance` to
            # the bare host name, which this exclusion does not match -
            # confirm that is intended.
            expr = ''up{instance!~"(nunotaba|hitagi|mayushii|fuuko)\\.vpn\\.sbruder\\.de:.*"} == 0'';
            description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.";
          }
          {
            name = "SystemdUnitFailed";
            expr = ''node_systemd_unit_state{state="failed"} == 1'';
            description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
          }
          {
            name = "NodeHighLoad";
            # 15-minute load average divided by the number of CPUs (one
            # mode="system" series exists per CPU).
            expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
            for = "15m";
            description = "Node {{ $labels.instance }} is having a per-core load > 2 for the last 15 minutes.";
          }
          {
            name = "NodeHighMemory";
            expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
            for = "2m";
            description = "Node {{ $labels.instance }} is using more than 90% of available RAM.";
          }
          {
            name = "TP440ACPIBroken";
            expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
            for = "10m";
            description = "Thinkpad T440s ACPI temperature is broken. Its reported temperature is 48°C for the last 10 minutes. That doesn't seem right. Try suspending";
          }
          {
            name = "TorrentNoPeers";
            expr = "sum by (instance) (qBittorrent_torrent_connected_leechs) == 0";
            description = "qBittorrent instance {{ $labels.instance }} has no peers. There might be a network connectivity problem";
          }
        ];
      };
    })
  ];
};
# Exporters that are not part of the nixpkgs Prometheus exporter infrastructure
# hcloud exporter on loopback; the "hcloud" scrape job reads listenAddress
# from here. Its environment file comes from a sops secret.
services.hcloud_exporter = {
  enable = true;
  listenAddress = "127.0.0.1:9501";
  environmentFile = config.sops.secrets.hcloud_exporter-environment.path;
};
sops.secrets.hcloud_exporter-environment.sopsFile = ../secrets.yaml;
# htpasswd file used by the nginx virtual host for basic auth, hence owned by
# the nginx user.
sops.secrets.prometheus-htpasswd = {
  owner = "nginx";
  sopsFile = ../secrets.yaml;
};
# Public entry point: TLS via ACME, HTTP basic auth, and reverse proxying to
# the loopback-only Prometheus and Alertmanager listeners.
services.nginx.virtualHosts."prometheus.sbruder.de" = {
  enableACME = true;
  forceSSL = true;
  basicAuthFile = config.sops.secrets.prometheus-htpasswd.path;
  locations = {
    "/".proxyPass = "http://${cfg.listenAddress}:${toString cfg.port}";
    # No URI part on proxyPass, so the /alertmanager/ prefix is forwarded
    # unchanged to Alertmanager.
    "/alertmanager/".proxyPass = "http://${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}";
  };
};
}