fuuko: Add prometheus
This commit is contained in:
parent
df303dcc2b
commit
f388995ef6
|
@ -4,6 +4,8 @@
|
|||
./hardware-configuration.nix
|
||||
../../modules
|
||||
../../users/simon
|
||||
|
||||
./services/prometheus.nix
|
||||
];
|
||||
|
||||
sbruder = {
|
||||
|
@ -25,6 +27,7 @@
|
|||
};
|
||||
};
|
||||
networking.firewall.allowedTCPPorts = [ 80 443 ];
|
||||
users.users.nginx.extraGroups = [ "keys" ];
|
||||
|
||||
networking.hostName = "fuuko";
|
||||
|
||||
|
|
134
machines/fuuko/services/prometheus.nix
Normal file
134
machines/fuuko/services/prometheus.nix
Normal file
|
@ -0,0 +1,134 @@
|
|||
{ config, lib, pkgs, ... }:
|
||||
let
  # Shorthand for this machine's Prometheus option tree.
  cfg = config.services.prometheus;

  # Wrap a list of "host:port" strings into a one-element `static_configs` list.
  mkStaticTargets = targets: [ { inherit targets; } ];

  # Same as mkStaticTargets, but for a single "host:port" string.
  mkStaticTarget = target: mkStaticTargets [ target ];
in
|
||||
{
|
||||
# Bind-mount /data/prometheus onto the working directory of prometheus.service.
systemd.mounts = [
  {
    # A bind mount has no filesystem type of its own.
    type = "none";
    options = "bind";
    what = "/data/prometheus";
    where = config.systemd.services.prometheus.serviceConfig.WorkingDirectory;

    # Ordered after data.mount and before prometheus.service.
    after = [ "data.mount" ];
    before = [ "prometheus.service" ];
    wantedBy = [ "local-fs.target" ];
  }
];
|
||||
|
||||
# Prometheus server with a co-located Alertmanager, scrape jobs and alert rules.
services.prometheus = {
  enable = true;
  # Listen on loopback only.
  listenAddress = "127.0.0.1";

  globalConfig = {
    scrape_interval = "15s";
    evaluation_interval = "15s";
  };

  # Send alerts to the Alertmanager instance configured below.
  alertmanagers = [
    { static_configs = mkStaticTarget "${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}"; }
  ];

  alertmanager = {
    enable = true;
    listenAddress = "127.0.0.1";
    configuration = {
      global.resolve_timeout = "2m";

      # Single route: everything goes to the "matrix" receiver, grouped by alert name.
      route = {
        receiver = "matrix";
        group_by = [ "alertname" ];
        group_wait = "3m";
      };

      receivers = [
        {
          name = "matrix";
          webhook_configs = [
            # FIXME: Add correct URL
            { url = "http://matrix-alertmanager:3000/alerts"; }
          ];
        }
      ];
    };
  };

  scrapeConfigs = [
    # Prometheus scraping itself.
    {
      job_name = "prometheus";
      static_configs = mkStaticTarget "localhost:${toString cfg.port}";
    }
    # Hosts scraped on port 9100 via their VPN names.
    {
      job_name = "node";
      static_configs = mkStaticTargets [
        "fuuko.vpn.sbruder.de:9100"
        "issei.vpn.sbruder.de:9100"
        "nunotaba.vpn.sbruder.de:9100"
        "sayuri.vpn.sbruder.de:9100"
        "vueko.vpn.sbruder.de:9100"
      ];
    }
  ];

  rules =
    let
      # Build one alerting rule.
      #   name:        alert name
      #   expr:        PromQL expression that triggers the alert
      #   for:         how long expr must hold before firing (default "1m")
      #   description: optional annotation text; omitted from the rule when null
      mkAlert = { name, expr, for ? "1m", description ? null }: {
        alert = name;
        inherit expr for;
        annotations = lib.optionalAttrs (description != null) { inherit description; };
      };
    in
    [
      # The option takes rule files as strings, so the rule group is serialised here.
      (lib.generators.toYAML { } {
        groups = lib.singleton {
          name = "alert.rules";
          rules = map mkAlert [
            {
              name = "InstanceDown";
              # nunotaba and sayuri are excluded from this alert.
              # The dots are escaped so they match a literal "."; PromQL
              # double-quoted strings need the backslash doubled ("\\.").
              expr = ''up{instance!~"(nunotaba|sayuri)\\.vpn\\.sbruder\\.de:.*"} == 0'';
              description = "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.";
            }
            {
              name = "SystemdUnitFailed";
              expr = ''node_systemd_unit_state{state="failed"} == 1'';
              description = "Systemd unit {{ $labels.name }} on {{ $labels.instance }} has state failed.";
            }
            {
              name = "NodeHighLoad";
              # 15-minute load divided by the number of CPUs (one mode="system" series per CPU).
              expr = ''sum by (instance) (node_load15) / count by (instance) (node_cpu_seconds_total{mode="system"}) > 2'';
              for = "15m";
              # Wording matches the strict ">" comparison in expr.
              description = "This node is having a per-core load > 2 for the last 15 minutes.";
            }
            {
              name = "NodeHighMemory";
              expr = ''(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9'';
              for = "2m";
              description = "This node is using more than 90 % of available RAM.";
            }
            {
              name = "TP440ACPIBroken";
              # Fires when the sensor reports exactly 48 for 10 minutes.
              expr = ''node_hwmon_temp_celsius{chip="thermal_thermal_zone0",instance="nunotaba.vpn.sbruder.de:9100",job="node",sensor="temp1"} == 48'';
              for = "10m";
              description = "Thinkpad T440’s ACPI temperature is broken. Its reported temperature is 48 °C for the last 10 minutes. That doesn’t seem right. Try suspending";
            }
          ];
        };
      })
    ];
};
|
||||
|
||||
# htpasswd secret assigned to group "nginx"; its path is used as the
# basicAuthFile of the prometheus.sbruder.de virtual host.
krops.secrets.prometheus-htpasswd.group = "nginx";
|
||||
|
||||
# TLS virtual host with basic auth that proxies to Prometheus ("/") and
# Alertmanager ("/alertmanager/").
services.nginx.virtualHosts."prometheus.sbruder.de" =
  let
    prometheusUpstream = "http://${cfg.listenAddress}:${toString cfg.port}";
    # Trailing slash kept exactly as in the proxy target.
    alertmanagerUpstream = "http://${cfg.alertmanager.listenAddress}:${toString cfg.alertmanager.port}/";
  in
  {
    forceSSL = true;
    enableACME = true;

    # Credentials come from the krops-managed htpasswd secret.
    basicAuthFile = config.krops.secrets.prometheus-htpasswd.path;

    locations."/".proxyPass = prometheusUpstream;
    locations."/alertmanager/".proxyPass = alertmanagerUpstream;
  };
|
||||
}
|
Loading…
Reference in a new issue