# NixOS module: post-rebuild health watchdog with automatic rollback.
#
# * openclaw-watchdog.service runs the health check (`nixos-rollback.sh check`).
#   If it fails, systemd starts nixos-rollback.service via `onFailure`.
# * nixos-rollback.service runs `nixos-rollback.sh rollback`, which switches
#   back to the previous generation and re-checks health.
#
# NOTE(review): nothing in this module starts openclaw-watchdog.service
# (there is no `wantedBy` or timer here); presumably it is started by
# something outside this file -- confirm.
{ config, pkgs, ... }:
{
  systemd.services.openclaw-watchdog = {
    description = "Post-rebuild health watchdog";
    after = [ "network.target" ];
    # NixOS services get a minimal PATH (coreutils, findutils, gnugrep,
    # gnused, systemd). The script additionally needs bash (for the
    # /dev/tcp probe) and curl (for notifications).
    path = [
      pkgs.bash
      pkgs.curl
    ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "/etc/openclaw/nixos-rollback.sh check";
    };
    onFailure = [ "nixos-rollback.service" ];
  };

  systemd.services.nixos-rollback = {
    description = "Autonomous NixOS rollback";
    # Same tools as the watchdog, plus nixos-rebuild and the nix CLI it
    # invokes to flip the system profile.
    path = [
      pkgs.bash
      pkgs.curl
      config.nix.package.out
      config.system.build.nixos-rebuild
    ];
    # The rollback performs a configuration switch itself; keep the switch
    # from restarting this unit while it is still running.
    restartIfChanged = false;
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "/etc/openclaw/nixos-rollback.sh rollback";
    };
  };

  environment.etc."openclaw/nixos-rollback.sh" = {
    mode = "0750";
    text = ''
      #!${pkgs.runtimeShell}
      # Health check / rollback helper.
      #
      # Usage: nixos-rollback.sh [check|rollback]   (default: check)
      #   check     poll unit + SSH health for up to ~60s; exit 0 when healthy,
      #             exit 1 otherwise.
      #   rollback  switch to the previous generation and re-check health;
      #             exit 1 if the rollback or the re-check fails.
      set -euo pipefail

      # An absent or unreadable secret simply disables notifications.
      WEBHOOK="$(cat /run/secrets/discord-webhook 2>/dev/null || echo "")"
      UNITS=("sshd" "docker" "bluesky-pds" "cloudflared" "zipline")
      # Read the kernel hostname directly so no hostname binary is required.
      HOSTNAME="$(cat /proc/sys/kernel/hostname)"

      # notify MESSAGE
      # Best-effort Discord notification. It must never abort the script:
      # the network may well be down when a rollback is needed, and a failed
      # notification must not prevent the rollback itself.
      # MESSAGE is embedded verbatim in a JSON string, so it must not contain
      # double quotes or backslashes.
      notify() {
        [ -z "$WEBHOOK" ] && return 0
        curl -s --max-time 10 -X POST "$WEBHOOK" \
          -H "Content-Type: application/json" \
          -d "{\"content\": \"$1\"}" >/dev/null 2>&1 || true
      }

      # Succeeds only when every unit in UNITS is active.
      check_units() {
        local unit
        for unit in "''${UNITS[@]}"; do
          if ! systemctl is-active --quiet "$unit"; then
            return 1
          fi
        done
        return 0
      }

      # Succeeds when a TCP connection to 127.0.0.1:22 opens within 5 seconds.
      check_ssh() {
        timeout 5 bash -c 'echo > /dev/tcp/127.0.0.1/22' 2>/dev/null
      }

      # Poll health 6 times, 10 seconds apart; exit 0 on the first healthy
      # poll, exit 1 if the system never becomes healthy.
      do_check() {
        local attempt
        for attempt in 1 2 3 4 5 6; do
          sleep 10
          if check_units && check_ssh; then
            notify "**[$HOSTNAME] NixOS switch healthy** — all units OK after rebuild."
            exit 0
          fi
        done
        exit 1
      }

      # Roll back to the previous generation, wait 15 seconds, then re-check.
      # Exits non-zero when the rollback did not restore a healthy system so
      # that the unit is reported as failed.
      do_rollback() {
        notify "**[$HOSTNAME] ROLLBACK TRIGGERED** — health check failed. Rolling back..."
        if nixos-rebuild switch --rollback; then
          sleep 15
          if check_units && check_ssh; then
            notify "**[$HOSTNAME] Rollback successful** — previous generation restored."
          else
            notify "**[$HOSTNAME] URGENT — rollback also failed.** Manual intervention needed."
            exit 1
          fi
        else
          notify "**[$HOSTNAME] URGENT — rollback command failed.** Manual intervention needed."
          exit 1
        fi
      }

      case "''${1:-check}" in
        check) do_check ;;
        rollback) do_rollback ;;
        *)
          echo "usage: $0 [check|rollback]" >&2
          exit 2
          ;;
      esac
    '';
  };
}