Files
server-config/modules/ntfy-alerts.nix

133 lines
3.7 KiB
Nix

{
config,
lib,
pkgs,
...
}:
let
cfg = config.services.ntfyAlerts;
curl = "${pkgs.curl}/bin/curl";
hostname = config.networking.hostName;
# Build the curl auth args as a proper bash array fragment
authCurlArgs =
if cfg.tokenFile != null then
''
if [ -f "${cfg.tokenFile}" ]; then
TOKEN=$(cat "${cfg.tokenFile}" 2>/dev/null || echo "")
if [ -n "$TOKEN" ]; then
AUTH_ARGS=(-H "Authorization: Bearer $TOKEN")
fi
fi
''
else
"";
# Systemd failure alert script
systemdAlertScript = pkgs.writeShellScript "ntfy-systemd-alert" ''
set -euo pipefail
UNIT_NAME="$1"
SERVER_URL="${cfg.serverUrl}"
TOPIC=$(cat "${cfg.topicFile}" 2>/dev/null | tr -d '[:space:]')
if [ -z "$TOPIC" ]; then
echo "ERROR: Could not read topic from ${cfg.topicFile}"
exit 1
fi
# Get journal output for context
JOURNAL_OUTPUT=$(${pkgs.systemd}/bin/journalctl -u "$UNIT_NAME" -n 15 --no-pager 2>/dev/null || echo "No journal output available")
# Build auth args
AUTH_ARGS=()
${authCurlArgs}
# Send notification
${curl} -sf --max-time 15 -X POST \
"$SERVER_URL/$TOPIC" \
-H "Title: [${hostname}] Service failed: $UNIT_NAME" \
-H "Priority: high" \
-H "Tags: warning" \
"''${AUTH_ARGS[@]}" \
-d "$JOURNAL_OUTPUT" || true
'';
in
{
options.services.ntfyAlerts = {
enable = lib.mkEnableOption "ntfy push notifications for system alerts";
serverUrl = lib.mkOption {
type = lib.types.str;
description = "The ntfy server URL (e.g. https://ntfy.example.com)";
example = "https://ntfy.example.com";
};
topicFile = lib.mkOption {
type = lib.types.path;
description = "Path to a file containing the ntfy topic name to publish alerts to.";
example = "/run/agenix/ntfy-alerts-topic";
};
tokenFile = lib.mkOption {
type = lib.types.nullOr lib.types.path;
default = null;
description = ''
Path to a file containing the ntfy auth token.
If set, uses Authorization: Bearer header for authentication.
'';
example = "/run/secrets/ntfy-token";
};
};
config = lib.mkIf cfg.enable {
# Per-service OnFailure for monitored services
systemd.services = {
"ntfy-alert@" = {
description = "Send ntfy notification for failed service %i";
unitConfig.OnFailure = lib.mkForce "";
serviceConfig = {
Type = "oneshot";
ExecStart = "${systemdAlertScript} %i";
TimeoutSec = 30;
};
};
# TODO: sanoid's ExecStartPre runs `zfs allow` which blocks on TXG sync;
# on the hdds pool (slow spinning disks + large async frees) this causes
# 30+ minute hangs and guaranteed timeouts. Suppress until we fix sanoid
# to run as root without `zfs allow`. See: nixpkgs#72060, openzfs/zfs#14180
"sanoid".unitConfig.OnFailure = lib.mkForce "";
};
# Global OnFailure drop-in for all services
systemd.packages = [
(pkgs.writeTextDir "etc/systemd/system/service.d/onfailure.conf" ''
[Unit]
OnFailure=ntfy-alert@%p.service
'')
# Sanoid-specific drop-in to override the global OnFailure (see TODO above)
(pkgs.writeTextDir "etc/systemd/system/sanoid.service.d/onfailure.conf" ''
[Unit]
OnFailure=
'')
];
# ZED (ZFS Event Daemon) ntfy notification settings
services.zfs.zed = {
enableMail = false;
settings = {
ZED_NTFY_URL = cfg.serverUrl;
ZED_NTFY_TOPIC = "$(cat ${cfg.topicFile} | tr -d '[:space:]')";
ZED_NTFY_ACCESS_TOKEN = lib.mkIf (cfg.tokenFile != null) "$(cat ${cfg.tokenFile})";
ZED_NOTIFY_VERBOSE = true;
};
};
};
}