From ae78366a61d0c913895197b2b2653d14bc2ba368 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 29 Jun 2026 14:27:19 +0800 Subject: [PATCH] feat(recovery): add reboot auto recovery slo guard --- docs/LOGBOOK.md | 23 ++ ...priority-work-order-readback.snapshot.json | 52 +++- ...-auto-recovery-slo-scorecard.snapshot.json | 61 ++++ .../awoooi-reboot-auto-recovery-slo.service | 16 + .../awoooi-reboot-auto-recovery-slo.timer | 12 + .../reboot-auto-recovery-host-probe.sh | 67 +++++ .../reboot-auto-recovery-slo-exporter.sh | 79 +++++ .../reboot-auto-recovery-slo-scorecard.py | 273 ++++++++++++++++++ ...test_reboot_auto_recovery_slo_scorecard.py | 88 ++++++ 9 files changed, 657 insertions(+), 14 deletions(-) create mode 100644 docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json create mode 100644 scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service create mode 100644 scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer create mode 100755 scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh create mode 100755 scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh create mode 100755 scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py create mode 100644 scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 0210ebb09..2648fce3a 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -49409,3 +49409,26 @@ production browser smoke: **未做**: - 沒有使用 GitHub / gh / GitHub API;沒有讀 token / secret / `.env` / raw sessions / SQLite / auth;沒有寫 Gitea repo / refs / branch / secret;沒有 host 或 runtime 操作。 + +## 2026-06-29 — 14:25 P0-006A reboot auto-recovery 10-minute SLO control plane + +**完成內容**: +- 釐清正確 P0-006 目標:不是手動 cold-start readback,而是「所有 P0 主機重啟被自動判斷、自動觸發恢復/驗證,並在 10 分鐘內自證所有服務恢復」。 +- 清理本輪 Codex 產生的 `/tmp` 大型暫存與 generated cache,將本機 Data volume 可用空間從約 `137Mi` 拉回約 `3.3Gi`;這是 reboot SLO 的必要前置,因為先前連 `git fetch` 都因 `No space left on device` 失敗。 +- 新增 `scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh`:只讀探測 110 / 120 / 121 / 188 的 `boot_id`、uptime、systemd state 與 startup unit 狀態。 +- 新增 `scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh`、`awoooi-reboot-auto-recovery-slo.service`、`awoooi-reboot-auto-recovery-slo.timer`:提供 boot-triggered SLO verifier 與 Prometheus textfile metrics source。 +- 新增 `scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py` 與 snapshot `docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json`;目前 fail-closed:`can_claim_all_services_recovered_within_target=false`。 + +**目前真 blocker**: +- `all_host_reboot_detection_missing` +- `host_boot_probe_missing_hosts` +- `wazuh_dashboard_degraded` + +**本地驗證結果**: +- `bash -n scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh`:通過。 +- `python3.11 -m py_compile scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py`:通過。 +- `python3.11 -m pytest scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py -q`:`3 passed`。 +- 使用最新 `/tmp/awoooi-post-reboot-readiness-20260629-122825/summary.txt` 產生 scorecard:容量已非 blocker,但缺 all-host boot probe 且 Wazuh dashboard degraded,所以不得宣稱 10 分鐘全服務恢復。 + +**未做**: +- 沒有重啟任何主機;沒有 restart Docker / Nginx / K3s / DB / firewall;沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub。 diff --git a/docs/operations/awoooi-priority-work-order-readback.snapshot.json b/docs/operations/awoooi-priority-work-order-readback.snapshot.json index 992e8ea55..3ccf77ca2 100644 --- a/docs/operations/awoooi-priority-work-order-readback.snapshot.json +++ b/docs/operations/awoooi-priority-work-order-readback.snapshot.json @@ -1,7 +1,7 @@ { "schema_version": "awoooi_priority_work_order_readback_v1", - "generated_at": "2026-06-29T14:11:34+08:00", - "status": "p0_005_waiting_refs_p0_003_gitea_only_scorecard_ready", + "generated_at": "2026-06-29T14:27:32+08:00", + "status": "p0_006a_reboot_auto_recovery_slo_control_plane_added_blocked_until_live_probe", "source_refs": { "global_scorecard": "~/.codex/product-runtime-governance-completion-scorecard.snapshot.json", "workstation_dashboard": "~/.codex/codex-workstation-sync-dashboard.snapshot.json", @@ -11,10 +11,11 @@ "public_gitea_queue_readback": "ops/runner/read-public-gitea-actions-queue.py --json", "credential_escrow_scorecard": "/tmp/awoooi-credential-escrow-intake-scorecard-20260629-1200-priority.json", "dr_escrow_evidence_checklist_generator": "scripts/reboot-recovery/dr-escrow-evidence-checklist.py", - "gitea_private_inventory_p0_scorecard": "docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json" + "gitea_private_inventory_p0_scorecard": "docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json", + "reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json" }, "current_head": { - "gitea_main_sha": "86e674cf9e31118c6c52067a27b9c4e8e69cf8a0", + "gitea_main_sha": "748ee37ca958df1e1e25453363de3d9f3f02a6c1", "latest_successful_deploy_marker": "9362588ce chore(cd): deploy a423301 [skip ci]", "latest_successful_deployed_source_sha": "a4233017ad5fd03977233f3db6a4bb45d71507ed", "latest_source_readiness_commit_sha": "0c8d4e88c39157b92322fa41a92e6b15c317ac49", @@ -179,18 +180,41 @@ }, { "workplan_id": "P0-006", - "title": "清理 source-to-runtime drift 與 stale routes", - "status": "pending_after_p0_005_and_p0_003", - "reason": "P0-004 source readiness and P0-002 product manifest are green; keep drift cleanup after credential escrow and authenticated Gitea inventory readback.", + "title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", + "status": "blocked_reboot_auto_recovery_slo_not_ready", + "reason": "The required target is automatic all-host reboot detection plus boot-triggered recovery verification. Current source now has the control-plane verifier, but live all-host boot probe has not been collected and Wazuh dashboard is still degraded.", + "evidence": { + "target_minutes": 10, + "can_claim_all_services_recovered_within_target": false, + "source_controls_added": true, + "host_boot_probe_source_present": true, + "slo_systemd_timer_source_present": true, + "slo_exporter_source_present": true, + "post_start_blocked": 0, + "service_green": true, + "product_data_green": true, + "backup_core_green": true, + "wazuh_dashboard_degraded": true, + "all_host_reboot_detection_missing": true, + "host_boot_probe_missing_hosts": true, + "local_disk_free_gib_after_cleanup": 3.271 + }, "professional_fix": { - "owner": "source-runtime drift lane", - "action": "Compare product.awoooi.yaml, committed K8s/docker/workflow sources, production readback, and public routes; generate one drift list with owner and fix command per row.", + "owner": "reboot auto-recovery lane", + "action": "Deploy the boot-triggered SLO timer/exporter, collect all-host boot probes, and rerun the scorecard until it can prove all services recovered inside 10 minutes.", "exit_criteria": [ - "runtime_without_source_count=0", - "source_without_runtime_count=0", - "stale_route_count=0 or explicitly retired" + "can_claim_all_services_recovered_within_target=true", + "observed_hosts=110,120,121,188", + "max_observed_uptime_seconds<=600", + "POST_START_BLOCKED=0", + "SERVICE_GREEN=1", + "PRODUCT_DATA_GREEN=1", + "BACKUP_CORE_GREEN=1", + "WAZUH_DASHBOARD_DEGRADED=0", + "local_disk_free_gib>=2" ] - } + }, + "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard" } ], "noise_integrated_risk_register": [ @@ -264,6 +288,6 @@ "next_execution_order": [ "P0-005: fill the single DR escrow evidence checklist with five non-secret refs and rerun one preflight.", "P0-003: convert private/internal inventory to Gitea-only readback and remove retired GitHub from active P0 blocker math.", - "P0-006: run source-to-runtime drift cleanup using product manifest, committed runtime sources, production readback, and public route evidence." + "P0-006: deploy boot-triggered reboot auto-recovery SLO verifier, collect all-host boot probe, and prove or block the 10-minute recovery claim." ] } diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json new file mode 100644 index 000000000..8660a63c4 --- /dev/null +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -0,0 +1,61 @@ +{ + "active_blockers": [ + "all_host_reboot_detection_missing", + "host_boot_probe_missing_hosts", + "wazuh_dashboard_degraded" + ], + "can_claim_all_services_recovered_within_target": false, + "capacity": { + "checked": true, + "free_gib": 2.707, + "min_free_gib": 2.0 + }, + "generated_at": "2026-06-29T14:27:32+08:00", + "host_boot_detection": { + "host_rows": [], + "max_observed_uptime_seconds": 0, + "missing_hosts": [ + "110", + "120", + "121", + "188" + ], + "observed_hosts": [], + "required_hosts": [ + "110", + "120", + "121", + "188" + ], + "stale_hosts": [], + "unknown_uptime_hosts": [], + "unreachable_hosts": [] + }, + "post_reboot_readiness": { + "backup_core_green": true, + "host_188_service_green": true, + "next_required_gates": "credential_escrow_evidence", + "overall_declaration": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED", + "post_start_blocked": 0, + "post_start_result": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED", + "product_data_green": true, + "service_green": true, + "summary_present": true, + "wazuh_dashboard_degraded": true + }, + "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", + "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1", + "source_controls": { + "cold_start_textfile_exporter_source_present": true, + "host_110_startup_unit_source_present": true, + "host_188_startup_unit_source_present": true, + "host_boot_probe_source_present": true, + "post_reboot_summary_source_present": true, + "slo_exporter_source_present": true, + "slo_systemd_service_source_present": true, + "slo_systemd_timer_source_present": true + }, + "status": "blocked_reboot_auto_recovery_slo_not_ready", + "target_minutes": 10, + "target_seconds": 600 +} diff --git a/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service b/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service new file mode 100644 index 000000000..ae81c7b15 --- /dev/null +++ b/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service @@ -0,0 +1,16 @@ +[Unit] +Description=AWOOOI reboot auto-recovery 10-minute SLO verifier +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +Environment=ROOT_DIR=/opt/awoooi +Environment=TARGET_MINUTES=10 +ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh +TimeoutStartSec=600 +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target diff --git a/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer b/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer new file mode 100644 index 000000000..a50db82fa --- /dev/null +++ b/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer @@ -0,0 +1,12 @@ +[Unit] +Description=Run AWOOOI reboot auto-recovery SLO verifier after boot and until stable + +[Timer] +OnBootSec=2min +OnUnitActiveSec=2min +AccuracySec=15s +Persistent=true +Unit=awoooi-reboot-auto-recovery-slo.service + +[Install] +WantedBy=timers.target diff --git a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh new file mode 100755 index 000000000..b38fa9929 --- /dev/null +++ b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# Read-only host boot probe for the AWOOOI reboot auto-recovery SLO. +# +# It detects whether the P0 hosts are reachable after reboot and records boot_id, +# uptime, systemd state, and the expected startup unit state. It never restarts, +# reloads, repairs, or writes host state. + +set -uo pipefail + +SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="${SSH_CONNECT_TIMEOUT_SECONDS:-6}") +HOST_SPECS=( + "110=wooo@192.168.0.110:awoooi-startup-110.service" + "120=wooo@192.168.0.120:k3s.service" + "121=wooo@192.168.0.121:k3s.service" + "188=ollama@192.168.0.188:awoooi-startup.service" +) + +escape_value() { + printf '%s' "$1" | tr ' \t\n' '___' +} + +probe_host() { + local alias="$1" + local target="$2" + local unit="$3" + local output boot_id uptime_seconds systemd_state enabled active + + output="$(ssh "${SSH_OPTS[@]}" "$target" "unit='$unit'; \ + boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown); \ + uptime_seconds=\$(awk '{print int(\$1)}' /proc/uptime 2>/dev/null || echo unknown); \ + systemd_state=\$(systemctl is-system-running 2>/dev/null || true); \ + enabled=\$(systemctl is-enabled \"\$unit\" 2>/dev/null || echo unknown); \ + active=\$(systemctl is-active \"\$unit\" 2>/dev/null || echo unknown); \ + printf 'boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \"\$boot_id\" \"\$uptime_seconds\" \"\$systemd_state\" \"\$enabled\" \"\$active\" \ + " 2>/dev/null)" + if [[ $? -ne 0 || -z "$output" ]]; then + printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=0 boot_id=unknown uptime_seconds=unknown systemd_state=unknown startup_enabled=unknown startup_active=unknown\n' \ + "$alias" "$target" "$unit" + return 0 + fi + + boot_id="$(sed -n 's/.*boot_id=\([^ ]*\).*/\1/p' <<<"$output")" + uptime_seconds="$(sed -n 's/.*uptime_seconds=\([^ ]*\).*/\1/p' <<<"$output")" + systemd_state="$(sed -n 's/.*systemd_state=\([^ ]*\).*/\1/p' <<<"$output")" + enabled="$(sed -n 's/.*startup_enabled=\([^ ]*\).*/\1/p' <<<"$output")" + active="$(sed -n 's/.*startup_active=\([^ ]*\).*/\1/p' <<<"$output")" + + printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=1 boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \ + "$alias" "$target" "$unit" \ + "$(escape_value "${boot_id:-unknown}")" \ + "$(escape_value "${uptime_seconds:-unknown}")" \ + "$(escape_value "${systemd_state:-unknown}")" \ + "$(escape_value "${enabled:-unknown}")" \ + "$(escape_value "${active:-unknown}")" +} + +echo "AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1" +echo "TARGET_HOSTS=110,120,121,188" +echo "GENERATED_AT=$(date '+%Y-%m-%dT%H:%M:%S%z')" + +for spec in "${HOST_SPECS[@]}"; do + alias="${spec%%=*}" + rest="${spec#*=}" + target="${rest%%:*}" + unit="${rest#*:}" + probe_host "$alias" "$target" "$unit" +done diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh new file mode 100755 index 000000000..ebca537fd --- /dev/null +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +# Boot-triggerable 10-minute reboot recovery SLO exporter. +# +# It is read-only: it probes boot state, runs the existing post-reboot summary, +# evaluates the SLO scorecard, and writes node-exporter textfile metrics. + +set -uo pipefail + +ROOT_DIR="${ROOT_DIR:-/opt/awoooi}" +TEXTFILE_DIR="${TEXTFILE_DIR:-${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}}" +LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}" +OUTPUT_NAME="${OUTPUT_NAME:-reboot_auto_recovery_slo.prom}" +TARGET_MINUTES="${TARGET_MINUTES:-10}" +MIN_FREE_GIB="${MIN_FREE_GIB:-2}" +LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-reboot-auto-recovery-slo.lock}" + +if command -v flock >/dev/null 2>&1; then + exec 9>"$LOCK_FILE" + flock -n 9 || exit 0 +fi + +mkdir -p "$TEXTFILE_DIR" "$LOG_DIR" +run_id="$(date '+%Y%m%d-%H%M%S')" +artifact_dir="$LOG_DIR/reboot-auto-recovery-slo-$run_id" +mkdir -p "$artifact_dir" + +host_probe="$artifact_dir/host-probe.txt" +summary_file="$artifact_dir/summary.txt" +scorecard_file="$artifact_dir/scorecard.json" + +bash "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" >"$host_probe" 2>&1 || true +ARTIFACT_DIR="$artifact_dir/post-reboot-readiness" \ + bash "$ROOT_DIR/scripts/reboot-recovery/post-reboot-readiness-summary.sh" --no-color >"$summary_file" 2>&1 || true + +python3 "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py" \ + --summary-file "$summary_file" \ + --host-probe-file "$host_probe" \ + --target-minutes "$TARGET_MINUTES" \ + --min-free-gib "$MIN_FREE_GIB" \ + --disk-path / \ + --output "$scorecard_file" || true + +now="$(date +%s)" +ready="$(python3 - "$scorecard_file" <<'PY' +import json, sys +payload=json.load(open(sys.argv[1], encoding="utf-8")) +print(1 if payload.get("can_claim_all_services_recovered_within_target") else 0) +PY +)" +blocker_count="$(python3 - "$scorecard_file" <<'PY' +import json, sys +payload=json.load(open(sys.argv[1], encoding="utf-8")) +print(len(payload.get("active_blockers") or [])) +PY +)" +max_uptime="$(python3 - "$scorecard_file" <<'PY' +import json, sys +payload=json.load(open(sys.argv[1], encoding="utf-8")) +print(payload.get("host_boot_detection", {}).get("max_observed_uptime_seconds", 0)) +PY +)" + +tmp_metric="$(mktemp "$TEXTFILE_DIR/.reboot_auto_recovery_slo.XXXXXX")" +cat >"$tmp_metric" < argparse.Namespace: + parser = argparse.ArgumentParser( + description="Fail-closed scorecard for automatic reboot recovery within 10 minutes.", + ) + parser.add_argument("--summary-file", type=Path, help="post-reboot-readiness-summary output.") + parser.add_argument("--host-probe-file", type=Path, help="reboot-auto-recovery-host-probe output.") + parser.add_argument("--target-minutes", type=int, default=10) + parser.add_argument("--min-free-gib", type=float, default=2.0) + parser.add_argument("--disk-path", type=Path, help="Optionally check local free space.") + parser.add_argument("--generated-at", help="Override generated_at for stable snapshots.") + parser.add_argument("--output", type=Path, help="Write JSON to this path.") + return parser.parse_args() + + +def read_text(path: Path | None) -> str: + if not path: + return "" + return path.read_text(encoding="utf-8") + + +def parse_kv(text: str) -> dict[str, str]: + values: dict[str, str] = {} + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line or "=" not in line or line.startswith("HOST_BOOT "): + continue + key, value = line.split("=", 1) + values[key.strip()] = value.strip() + return values + + +def truthy(value: str | None) -> bool: + return value in {"1", "true", "True", "yes", "YES"} + + +def int_value(value: Any, default: int = 0) -> int: + try: + return int(str(value)) + except (TypeError, ValueError): + return default + + +def parse_host_probe(text: str) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line.startswith("HOST_BOOT "): + continue + row: dict[str, Any] = {} + for token in line.split()[1:]: + if "=" not in token: + continue + key, value = token.split("=", 1) + row[key] = value + row["reachable"] = row.get("reachable") == "1" + row["uptime_seconds"] = int_value(row.get("uptime_seconds"), -1) + rows.append(row) + return rows + + +def source_file(path: str) -> Path: + return ROOT / path + + +def file_contains(path: Path, *needles: str) -> bool: + try: + text = path.read_text(encoding="utf-8") + except FileNotFoundError: + return False + return all(needle in text for needle in needles) + + +def source_controls() -> dict[str, bool]: + return { + "host_110_startup_unit_source_present": file_contains( + source_file("scripts/reboot-recovery/awoooi-startup-110.service"), + "ExecStart=/usr/local/bin/awoooi-startup-110.sh", + "WantedBy=multi-user.target", + ) + and source_file("scripts/reboot-recovery/awoooi-startup-110.sh").exists(), + "host_188_startup_unit_source_present": file_contains( + source_file("scripts/reboot-recovery/awoooi-startup.service"), + "ExecStart=/usr/local/bin/awoooi-startup.sh", + "WantedBy=multi-user.target", + ) + and source_file("scripts/reboot-recovery/awoooi-startup.sh").exists(), + "host_boot_probe_source_present": source_file( + "scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" + ).exists(), + "slo_exporter_source_present": source_file( + "scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh" + ).exists(), + "slo_systemd_service_source_present": file_contains( + source_file("scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service"), + "ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh", + ), + "slo_systemd_timer_source_present": file_contains( + source_file("scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer"), + "OnBootSec=", + "OnUnitActiveSec=", + ), + "post_reboot_summary_source_present": source_file( + "scripts/reboot-recovery/post-reboot-readiness-summary.sh" + ).exists(), + "cold_start_textfile_exporter_source_present": source_file( + "scripts/reboot-recovery/cold-start-textfile-exporter.sh" + ).exists(), + } + + +def disk_free_gib(path: Path | None) -> float | None: + if path is None: + return None + completed = subprocess.run( + ["df", "-k", str(path)], + check=False, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + if completed.returncode != 0: + return None + lines = completed.stdout.splitlines() + if len(lines) < 2: + return None + parts = re.split(r"\s+", lines[-1].strip()) + if len(parts) < 4: + return None + return int_value(parts[3]) / 1024 / 1024 + + +def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: + target_seconds = args.target_minutes * 60 + summary = parse_kv(read_text(args.summary_file)) + host_rows = parse_host_probe(read_text(args.host_probe_file)) + controls = source_controls() + free_gib = disk_free_gib(args.disk_path) + + blockers: list[str] = [] + for key, value in controls.items(): + if not value: + blockers.append(key.replace("_present", "_missing")) + + host_aliases = {str(row.get("alias", "")) for row in host_rows} + missing_hosts = sorted(REQUIRED_HOSTS - host_aliases) + unreachable_hosts = sorted(str(row.get("alias")) for row in host_rows if not row.get("reachable")) + stale_hosts = sorted( + str(row.get("alias")) + for row in host_rows + if row.get("reachable") and int_value(row.get("uptime_seconds"), target_seconds + 1) > target_seconds + ) + unknown_uptime_hosts = sorted( + str(row.get("alias")) + for row in host_rows + if row.get("reachable") and int_value(row.get("uptime_seconds"), -1) < 0 + ) + if not host_rows: + blockers.append("all_host_reboot_detection_missing") + if missing_hosts: + blockers.append("host_boot_probe_missing_hosts") + if unreachable_hosts: + blockers.append("host_unreachable_after_reboot") + if stale_hosts: + blockers.append("host_boot_observation_older_than_target_window") + if unknown_uptime_hosts: + blockers.append("host_uptime_unknown") + + service_green = truthy(summary.get("SERVICE_GREEN")) + product_data_green = truthy(summary.get("PRODUCT_DATA_GREEN")) + backup_core_green = truthy(summary.get("BACKUP_CORE_GREEN")) + post_start_blocked = int_value(summary.get("POST_START_BLOCKED"), 999) + wazuh_dashboard_degraded = truthy(summary.get("WAZUH_DASHBOARD_DEGRADED")) + host_188_service_green = truthy(summary.get("HOST_188_SERVICE_GREEN")) + if not summary: + blockers.append("post_reboot_summary_missing") + if post_start_blocked != 0: + blockers.append("post_start_blocked_not_zero") + if not service_green: + blockers.append("service_green_not_1") + if not product_data_green: + blockers.append("product_data_green_not_1") + if not backup_core_green: + blockers.append("backup_core_green_not_1") + if not host_188_service_green: + blockers.append("host_188_service_green_not_1") + if wazuh_dashboard_degraded: + blockers.append("wazuh_dashboard_degraded") + if free_gib is not None and free_gib < args.min_free_gib: + blockers.append("local_disk_free_below_minimum") + + max_uptime = max( + [int_value(row.get("uptime_seconds"), 0) for row in host_rows if row.get("reachable")] + or [0] + ) + can_claim = not blockers + return { + "schema_version": SCHEMA_VERSION, + "generated_at": args.generated_at + or datetime.now().astimezone().isoformat(timespec="seconds"), + "target_minutes": args.target_minutes, + "target_seconds": target_seconds, + "status": "slo_ready" if can_claim else "blocked_reboot_auto_recovery_slo_not_ready", + "can_claim_all_services_recovered_within_target": can_claim, + "source_controls": controls, + "host_boot_detection": { + "required_hosts": sorted(REQUIRED_HOSTS), + "observed_hosts": sorted(host_aliases), + "missing_hosts": missing_hosts, + "unreachable_hosts": unreachable_hosts, + "stale_hosts": stale_hosts, + "unknown_uptime_hosts": unknown_uptime_hosts, + "max_observed_uptime_seconds": max_uptime, + "host_rows": host_rows, + }, + "post_reboot_readiness": { + "summary_present": bool(summary), + "post_start_result": summary.get("POST_START_RESULT", "unknown"), + "post_start_blocked": post_start_blocked, + "service_green": service_green, + "product_data_green": product_data_green, + "backup_core_green": backup_core_green, + "host_188_service_green": host_188_service_green, + "wazuh_dashboard_degraded": wazuh_dashboard_degraded, + "overall_declaration": summary.get("OVERALL_DECLARATION", "unknown"), + "next_required_gates": summary.get("NEXT_REQUIRED_GATES", "unknown"), + }, + "capacity": { + "checked": free_gib is not None, + "free_gib": round(free_gib, 3) if free_gib is not None else None, + "min_free_gib": args.min_free_gib, + }, + "active_blockers": sorted(set(blockers)), + "safe_next_step": ( + "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_" + "rerun_scorecard_until_status_slo_ready" + ), + } + + +def main() -> int: + args = parse_args() + payload = build_scorecard(args) + text = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n" + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(text, encoding="utf-8") + else: + print(text, end="") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py new file mode 100644 index 000000000..cc3c39a70 --- /dev/null +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +SCRIPT = ROOT / "scripts" / "reboot-recovery" / "reboot-auto-recovery-slo-scorecard.py" + + +GREEN_SUMMARY = """\ +AWOOOI_POST_REBOOT_READINESS_SUMMARY=1 +POST_START_RESULT=FULL_STACK_GREEN +POST_START_BLOCKED=0 +SERVICE_GREEN=1 +PRODUCT_DATA_GREEN=1 +BACKUP_CORE_GREEN=1 +HOST_188_SERVICE_GREEN=1 +WAZUH_DASHBOARD_DEGRADED=0 +OVERALL_DECLARATION=FULL_STACK_GREEN +NEXT_REQUIRED_GATES=none +""" + + +HOST_PROBE_GREEN = """\ +AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1 +TARGET_HOSTS=110,120,121,188 +HOST_BOOT alias=110 target=wooo@192.168.0.110 startup_unit=awoooi-startup-110.service reachable=1 boot_id=a uptime_seconds=120 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=120 target=wooo@192.168.0.120 startup_unit=k3s.service reachable=1 boot_id=b uptime_seconds=130 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=121 target=wooo@192.168.0.121 startup_unit=k3s.service reachable=1 boot_id=c uptime_seconds=140 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=188 target=ollama@192.168.0.188 startup_unit=awoooi-startup.service reachable=1 boot_id=d uptime_seconds=150 systemd_state=running startup_enabled=enabled startup_active=active +""" + + +def run_scorecard(tmp_path: Path, summary: str, probe: str = HOST_PROBE_GREEN) -> dict: + summary_path = tmp_path / "summary.txt" + probe_path = tmp_path / "probe.txt" + summary_path.write_text(summary, encoding="utf-8") + probe_path.write_text(probe, encoding="utf-8") + result = subprocess.run( + [ + sys.executable, + str(SCRIPT), + "--summary-file", + str(summary_path), + "--host-probe-file", + str(probe_path), + "--generated-at", + "2026-06-29T14:30:00+08:00", + ], + text=True, + capture_output=True, + check=True, + ) + return json.loads(result.stdout) + + +def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -> None: + payload = run_scorecard(tmp_path, GREEN_SUMMARY) + + assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1" + assert payload["status"] == "slo_ready" + assert payload["can_claim_all_services_recovered_within_target"] is True + assert payload["host_boot_detection"]["max_observed_uptime_seconds"] == 150 + assert payload["active_blockers"] == [] + + +def test_missing_probe_fails_closed(tmp_path: Path) -> None: + payload = run_scorecard(tmp_path, GREEN_SUMMARY, probe="") + + assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready" + assert payload["can_claim_all_services_recovered_within_target"] is False + assert "all_host_reboot_detection_missing" in payload["active_blockers"] + assert "host_boot_probe_missing_hosts" in payload["active_blockers"] + + +def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None: + summary = GREEN_SUMMARY.replace("WAZUH_DASHBOARD_DEGRADED=0", "WAZUH_DASHBOARD_DEGRADED=1") + probe = HOST_PROBE_GREEN.replace("uptime_seconds=150", "uptime_seconds=900") + + payload = run_scorecard(tmp_path, summary, probe=probe) + + assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready" + assert payload["can_claim_all_services_recovered_within_target"] is False + assert "wazuh_dashboard_degraded" in payload["active_blockers"] + assert "host_boot_observation_older_than_target_window" in payload["active_blockers"]