Files
nix/modules/openclaw-watchdog.nix
2026-03-28 00:02:32 +02:00

83 lines
2.2 KiB
Nix

{ pkgs, ... }:
{
# One-shot health probe: runs the `check` mode of /etc/openclaw/nixos-rollback.sh.
# If the script exits non-zero the unit fails and systemd starts
# nixos-rollback.service through onFailure.
systemd.services.openclaw-watchdog = {
  description = "Post-rebuild health watchdog";
  after = [ "network.target" ];
  # NixOS units only get coreutils/findutils/gnugrep/gnused/systemd on PATH by
  # default. The script's `#!/usr/bin/env bash` shebang and its calls to
  # curl and hostname need these extra entries, otherwise the check fails for
  # reasons unrelated to system health and triggers a needless rollback.
  path = [
    pkgs.bash
    pkgs.curl
    "/run/current-system/sw" # hostname and other system-wide tools
  ];
  serviceConfig = {
    Type = "oneshot";
    ExecStart = "/etc/openclaw/nixos-rollback.sh check";
  };
  onFailure = [ "nixos-rollback.service" ];
};
# One-shot unit that runs the `rollback` mode of /etc/openclaw/nixos-rollback.sh,
# which calls `nixos-rebuild switch --rollback`.
systemd.services.nixos-rollback = {
  description = "Autonomous NixOS rollback";
  # The default unit PATH lacks bash, curl, hostname, nixos-rebuild and nix;
  # the system profile provides the latter ones.
  path = [
    pkgs.bash
    pkgs.curl
    "/run/current-system/sw"
  ];
  # This unit performs a configuration switch from inside itself. Without the
  # two settings below, activation of the target generation may stop or
  # restart this unit (and kill the rollback half-way) whenever the unit
  # differs between generations or is absent from the older one. The upstream
  # nixos-upgrade unit uses the same pair of settings for the same reason.
  restartIfChanged = false;
  unitConfig.X-StopOnRemoval = false;
  serviceConfig = {
    Type = "oneshot";
    ExecStart = "/etc/openclaw/nixos-rollback.sh rollback";
  };
};
# Health-check / rollback helper installed at /etc/openclaw/nixos-rollback.sh.
# Modes (first argument, defaults to `check`):
#   check    - poll unit and SSH health; exit 0 when healthy, 1 otherwise
#   rollback - run `nixos-rebuild switch --rollback` and verify the result
environment.etc."openclaw/nixos-rollback.sh" = {
  mode = "0750";
  text = ''
    #!${pkgs.bash}/bin/bash
    set -euo pipefail

    # Do not rely on the caller's PATH: systemd units on NixOS start with a
    # minimal one. Pinned tools come first, then the system profile (which
    # provides nixos-rebuild, nix and hostname), then whatever was inherited.
    export PATH="${pkgs.lib.makeBinPath [ pkgs.bash pkgs.coreutils pkgs.curl ]}:/run/current-system/sw/bin''${PATH:+:$PATH}"

    # An unreadable or missing secret simply disables notifications.
    WEBHOOK="$(cat /run/secrets/discord-webhook 2>/dev/null || echo "")"
    UNITS=("sshd" "docker" "bluesky-pds" "cloudflared" "zipline")
    # Fall back to the kernel value so a missing hostname binary cannot
    # abort the script under `set -e`.
    HOSTNAME="$(hostname 2>/dev/null || cat /proc/sys/kernel/hostname)"

    # notify MESSAGE
    # Best-effort POST of MESSAGE to the webhook. Never fails: a network
    # outage must not abort a health check or, worse, a rollback.
    notify() {
      [ -z "$WEBHOOK" ] && return 0
      local msg=$1
      # Escape backslashes and double quotes so the JSON payload stays valid.
      msg=''${msg//\\/\\\\}
      msg=''${msg//\"/\\\"}
      curl -s --max-time 10 -X POST "$WEBHOOK" \
        -H "Content-Type: application/json" \
        -d "{\"content\": \"$msg\"}" || true
    }

    # Succeeds only if every unit listed in UNITS is active.
    check_units() {
      local unit
      for unit in "''${UNITS[@]}"; do
        if ! systemctl is-active --quiet "$unit"; then
          return 1
        fi
      done
      return 0
    }

    # Succeeds if a TCP connection to 127.0.0.1:22 opens within 5 seconds.
    check_ssh() {
      timeout 5 bash -c 'echo > /dev/tcp/127.0.0.1/22' 2>/dev/null
    }

    # Up to 6 attempts, 10 seconds apart. Exits 0 on the first healthy
    # attempt, 1 if none succeeds.
    do_check() {
      local attempt
      for attempt in $(seq 1 6); do
        sleep 10
        if check_units && check_ssh; then
          notify "**[$HOSTNAME] NixOS switch healthy** all units OK after rebuild."
          exit 0
        fi
      done
      exit 1
    }

    # Roll back one generation, wait 15 seconds, then re-run the health
    # checks. Returns non-zero when the rollback command fails or the system
    # is still unhealthy afterwards, so the unit state reflects the outcome.
    do_rollback() {
      notify "**[$HOSTNAME] ROLLBACK TRIGGERED** health check failed. Rolling back..."
      if ! nixos-rebuild switch --rollback; then
        notify "**[$HOSTNAME] URGENT rollback command failed.** Manual intervention needed."
        return 1
      fi
      sleep 15
      if check_units && check_ssh; then
        notify "**[$HOSTNAME] Rollback successful** previous generation restored."
      else
        notify "**[$HOSTNAME] URGENT rollback also failed.** Manual intervention needed."
        return 1
      fi
    }

    case "''${1:-check}" in
      check) do_check ;;
      rollback) do_rollback ;;
      *)
        # An unknown mode used to be a silent, successful no-op.
        echo "usage: $0 [check|rollback]" >&2
        exit 2
        ;;
    esac
  '';
};
}