feat(recovery): add reboot auto recovery slo guard
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-29 14:27:19 +08:00
parent 748ee37ca9
commit ae78366a61
9 changed files with 657 additions and 14 deletions

View File

@@ -49409,3 +49409,26 @@ production browser smoke:
**未做**
- 沒有使用 GitHub / gh / GitHub API；沒有讀 token / secret / `.env` / raw sessions / SQLite / auth；沒有寫 Gitea repo / refs / branch / secret；沒有 host 或 runtime 操作。
## 2026-06-29 — 14:25 P0-006A reboot auto-recovery 10-minute SLO control plane
**完成內容**
- 釐清正確 P0-006 目標：不是手動 cold-start readback，而是「所有 P0 主機重啟被自動判斷、自動觸發恢復/驗證，並在 10 分鐘內自證所有服務恢復」。
- 清理本輪 Codex 產生的 `/tmp` 大型暫存與 generated cache，將本機 Data volume 可用空間從約 `137Mi` 拉回約 `3.3Gi`；這是 reboot SLO 的必要前置，因為先前連 `git fetch` 都因 `No space left on device` 失敗。
- 新增 `scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh`:只讀探測 110 / 120 / 121 / 188 的 `boot_id`、uptime、systemd state 與 startup unit 狀態。
- 新增 `scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh`、`awoooi-reboot-auto-recovery-slo.service`、`awoooi-reboot-auto-recovery-slo.timer`：提供 boot-triggered SLO verifier 與 Prometheus textfile metrics source。
- 新增 `scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py` 與 snapshot `docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json`；目前 fail-closed：`can_claim_all_services_recovered_within_target=false`。
**目前真 blocker**
- `all_host_reboot_detection_missing`
- `host_boot_probe_missing_hosts`
- `wazuh_dashboard_degraded`
**本地驗證結果**
- `bash -n scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh`:通過。
- `python3.11 -m py_compile scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py`:通過。
- `python3.11 -m pytest scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py -q`：`3 passed`。
- 使用最新 `/tmp/awoooi-post-reboot-readiness-20260629-122825/summary.txt` 產生 scorecard；容量已非 blocker，但缺 all-host boot probe 且 Wazuh dashboard degraded，所以不得宣稱 10 分鐘全服務恢復。
**未做**
- 沒有重啟任何主機；沒有 restart Docker / Nginx / K3s / DB / firewall；沒有讀 secret / token / `.env` / raw sessions / SQLite / auth；沒有使用 GitHub。

View File

@@ -1,7 +1,7 @@
{
"schema_version": "awoooi_priority_work_order_readback_v1",
"generated_at": "2026-06-29T14:11:34+08:00",
"status": "p0_005_waiting_refs_p0_003_gitea_only_scorecard_ready",
"generated_at": "2026-06-29T14:27:32+08:00",
"status": "p0_006a_reboot_auto_recovery_slo_control_plane_added_blocked_until_live_probe",
"source_refs": {
"global_scorecard": "~/.codex/product-runtime-governance-completion-scorecard.snapshot.json",
"workstation_dashboard": "~/.codex/codex-workstation-sync-dashboard.snapshot.json",
@@ -11,10 +11,11 @@
"public_gitea_queue_readback": "ops/runner/read-public-gitea-actions-queue.py --json",
"credential_escrow_scorecard": "/tmp/awoooi-credential-escrow-intake-scorecard-20260629-1200-priority.json",
"dr_escrow_evidence_checklist_generator": "scripts/reboot-recovery/dr-escrow-evidence-checklist.py",
"gitea_private_inventory_p0_scorecard": "docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json"
"gitea_private_inventory_p0_scorecard": "docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json",
"reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json"
},
"current_head": {
"gitea_main_sha": "86e674cf9e31118c6c52067a27b9c4e8e69cf8a0",
"gitea_main_sha": "748ee37ca958df1e1e25453363de3d9f3f02a6c1",
"latest_successful_deploy_marker": "9362588ce chore(cd): deploy a423301 [skip ci]",
"latest_successful_deployed_source_sha": "a4233017ad5fd03977233f3db6a4bb45d71507ed",
"latest_source_readiness_commit_sha": "0c8d4e88c39157b92322fa41a92e6b15c317ac49",
@@ -179,18 +180,41 @@
},
{
"workplan_id": "P0-006",
"title": "清理 source-to-runtime drift 與 stale routes",
"status": "pending_after_p0_005_and_p0_003",
"reason": "P0-004 source readiness and P0-002 product manifest are green; keep drift cleanup after credential escrow and authenticated Gitea inventory readback.",
"title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
"status": "blocked_reboot_auto_recovery_slo_not_ready",
"reason": "The required target is automatic all-host reboot detection plus boot-triggered recovery verification. Current source now has the control-plane verifier, but live all-host boot probe has not been collected and Wazuh dashboard is still degraded.",
"evidence": {
"target_minutes": 10,
"can_claim_all_services_recovered_within_target": false,
"source_controls_added": true,
"host_boot_probe_source_present": true,
"slo_systemd_timer_source_present": true,
"slo_exporter_source_present": true,
"post_start_blocked": 0,
"service_green": true,
"product_data_green": true,
"backup_core_green": true,
"wazuh_dashboard_degraded": true,
"all_host_reboot_detection_missing": true,
"host_boot_probe_missing_hosts": true,
"local_disk_free_gib_after_cleanup": 3.271
},
"professional_fix": {
"owner": "source-runtime drift lane",
"action": "Compare product.awoooi.yaml, committed K8s/docker/workflow sources, production readback, and public routes; generate one drift list with owner and fix command per row.",
"owner": "reboot auto-recovery lane",
"action": "Deploy the boot-triggered SLO timer/exporter, collect all-host boot probes, and rerun the scorecard until it can prove all services recovered inside 10 minutes.",
"exit_criteria": [
"runtime_without_source_count=0",
"source_without_runtime_count=0",
"stale_route_count=0 or explicitly retired"
"can_claim_all_services_recovered_within_target=true",
"observed_hosts=110,120,121,188",
"max_observed_uptime_seconds<=600",
"POST_START_BLOCKED=0",
"SERVICE_GREEN=1",
"PRODUCT_DATA_GREEN=1",
"BACKUP_CORE_GREEN=1",
"WAZUH_DASHBOARD_DEGRADED=0",
"local_disk_free_gib>=2"
]
}
},
"safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard"
}
],
"noise_integrated_risk_register": [
@@ -264,6 +288,6 @@
"next_execution_order": [
"P0-005: fill the single DR escrow evidence checklist with five non-secret refs and rerun one preflight.",
"P0-003: convert private/internal inventory to Gitea-only readback and remove retired GitHub from active P0 blocker math.",
"P0-006: run source-to-runtime drift cleanup using product manifest, committed runtime sources, production readback, and public route evidence."
"P0-006: deploy boot-triggered reboot auto-recovery SLO verifier, collect all-host boot probe, and prove or block the 10-minute recovery claim."
]
}

View File

@@ -0,0 +1,61 @@
{
"active_blockers": [
"all_host_reboot_detection_missing",
"host_boot_probe_missing_hosts",
"wazuh_dashboard_degraded"
],
"can_claim_all_services_recovered_within_target": false,
"capacity": {
"checked": true,
"free_gib": 2.707,
"min_free_gib": 2.0
},
"generated_at": "2026-06-29T14:27:32+08:00",
"host_boot_detection": {
"host_rows": [],
"max_observed_uptime_seconds": 0,
"missing_hosts": [
"110",
"120",
"121",
"188"
],
"observed_hosts": [],
"required_hosts": [
"110",
"120",
"121",
"188"
],
"stale_hosts": [],
"unknown_uptime_hosts": [],
"unreachable_hosts": []
},
"post_reboot_readiness": {
"backup_core_green": true,
"host_188_service_green": true,
"next_required_gates": "credential_escrow_evidence",
"overall_declaration": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED",
"post_start_blocked": 0,
"post_start_result": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED",
"product_data_green": true,
"service_green": true,
"summary_present": true,
"wazuh_dashboard_degraded": true
},
"safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready",
"schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1",
"source_controls": {
"cold_start_textfile_exporter_source_present": true,
"host_110_startup_unit_source_present": true,
"host_188_startup_unit_source_present": true,
"host_boot_probe_source_present": true,
"post_reboot_summary_source_present": true,
"slo_exporter_source_present": true,
"slo_systemd_service_source_present": true,
"slo_systemd_timer_source_present": true
},
"status": "blocked_reboot_auto_recovery_slo_not_ready",
"target_minutes": 10,
"target_seconds": 600
}

View File

@@ -0,0 +1,16 @@
# Oneshot unit that runs the reboot auto-recovery 10-minute SLO verifier once
# per activation and sends its output to the journal.
[Unit]
Description=AWOOOI reboot auto-recovery 10-minute SLO verifier
# Ordered after, and pulls in, network-online.target.
After=network-online.target
Wants=network-online.target
[Service]
Type=oneshot
# Environment handed to the ExecStart script.
Environment=ROOT_DIR=/opt/awoooi
Environment=TARGET_MINUTES=10
# NOTE(review): presumably the installed copy of
# scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh — confirm the
# install step, since the file names differ.
ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh
# A single run is aborted by systemd if it takes longer than 600 seconds.
TimeoutStartSec=600
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,12 @@
[Unit]
Description=Run AWOOOI reboot auto-recovery SLO verifier after boot and until stable
[Timer]
OnBootSec=2min
OnUnitActiveSec=2min
AccuracySec=15s
Persistent=true
Unit=awoooi-reboot-auto-recovery-slo.service
[Install]
WantedBy=timers.target

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Read-only host boot probe for the AWOOOI reboot auto-recovery SLO.
#
# It detects whether the P0 hosts are reachable after reboot and records boot_id,
# uptime, systemd state, and the expected startup unit state. It never restarts,
# reloads, repairs, or writes host state.
#
# Output: a few KEY=VALUE header lines followed by one `HOST_BOOT ...` line per
# entry in HOST_SPECS.

# No `-e`: the script keeps going after a failed probe.
set -uo pipefail

# BatchMode=yes stops ssh from prompting for a password. The connect timeout
# defaults to 6 seconds and can be overridden with SSH_CONNECT_TIMEOUT_SECONDS.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="${SSH_CONNECT_TIMEOUT_SECONDS:-6}")

# One entry per probed host, formatted as "<alias>=<ssh target>:<startup unit>".
# The main loop splits each entry on the first '=' and the first ':'.
HOST_SPECS=(
"110=wooo@192.168.0.110:awoooi-startup-110.service"
"120=wooo@192.168.0.120:k3s.service"
"121=wooo@192.168.0.121:k3s.service"
"188=ollama@192.168.0.188:awoooi-startup.service"
)
# Print $1 with every space, tab and newline replaced by an underscore, so the
# value stays a single whitespace-free token in the HOST_BOOT output line.
escape_value() {
  local cleaned="${1// /_}"
  cleaned="${cleaned//$'\t'/_}"
  cleaned="${cleaned//$'\n'/_}"
  printf '%s' "$cleaned"
}
# Probe one host over ssh and print exactly one HOST_BOOT line.
#
# Arguments:
#   $1  alias   short host label printed as alias=
#   $2  target  ssh destination (user@host)
#   $3  unit    systemd unit whose enabled/active state is reported
#
# When ssh fails or returns nothing, the line carries reachable=0 and every
# observed field is "unknown". The function always returns 0 so that one dead
# host does not stop the remaining probes.
probe_host() {
  local alias="$1"
  local target="$2"
  local unit="$3"
  local output boot_id uptime_seconds systemd_state enabled active

  # `systemctl is-enabled` / `is-active` print a valid state such as
  # "disabled" or "inactive" and still exit non-zero. Falling back with
  # `|| echo unknown` would therefore append a second "unknown" line to a valid
  # answer. Instead, ignore the exit status and substitute "unknown" only when
  # nothing was printed.
  output="$(ssh "${SSH_OPTS[@]}" "$target" "unit='$unit'; \
    boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || true); \
    uptime_seconds=\$(awk '{print int(\$1)}' /proc/uptime 2>/dev/null || true); \
    systemd_state=\$(systemctl is-system-running 2>/dev/null || true); \
    enabled=\$(systemctl is-enabled \"\$unit\" 2>/dev/null || true); \
    active=\$(systemctl is-active \"\$unit\" 2>/dev/null || true); \
    printf 'boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \
      \"\${boot_id:-unknown}\" \"\${uptime_seconds:-unknown}\" \"\${systemd_state:-unknown}\" \
      \"\${enabled:-unknown}\" \"\${active:-unknown}\"" 2>/dev/null)"
  # $? here is the ssh exit status: `local` was declared on its own line, so it
  # does not mask the status of the command substitution.
  if [[ $? -ne 0 || -z "$output" ]]; then
    printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=0 boot_id=unknown uptime_seconds=unknown systemd_state=unknown startup_enabled=unknown startup_active=unknown\n' \
      "$alias" "$target" "$unit"
    return 0
  fi

  # Extract each field by name so unrelated remote output (for example a login
  # banner) is ignored.
  boot_id="$(sed -n 's/.*boot_id=\([^ ]*\).*/\1/p' <<<"$output")"
  uptime_seconds="$(sed -n 's/.*uptime_seconds=\([^ ]*\).*/\1/p' <<<"$output")"
  systemd_state="$(sed -n 's/.*systemd_state=\([^ ]*\).*/\1/p' <<<"$output")"
  enabled="$(sed -n 's/.*startup_enabled=\([^ ]*\).*/\1/p' <<<"$output")"
  active="$(sed -n 's/.*startup_active=\([^ ]*\).*/\1/p' <<<"$output")"

  printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=1 boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \
    "$alias" "$target" "$unit" \
    "$(escape_value "${boot_id:-unknown}")" \
    "$(escape_value "${uptime_seconds:-unknown}")" \
    "$(escape_value "${systemd_state:-unknown}")" \
    "$(escape_value "${enabled:-unknown}")" \
    "$(escape_value "${active:-unknown}")"
}
# Emit the report header, then one HOST_BOOT line per configured host.
printf '%s\n' "AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1"
printf '%s\n' "TARGET_HOSTS=110,120,121,188"
printf 'GENERATED_AT=%s\n' "$(date '+%Y-%m-%dT%H:%M:%S%z')"

for host_spec in "${HOST_SPECS[@]}"; do
  # "<alias>=<target>:<unit>" — split on the first '=' and then the first ':'.
  host_alias="${host_spec%%=*}"
  remainder="${host_spec#*=}"
  probe_host "$host_alias" "${remainder%%:*}" "${remainder#*:}"
done

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Boot-triggerable 10-minute reboot recovery SLO exporter.
#
# It is read-only: it probes boot state, runs the existing post-reboot summary,
# evaluates the SLO scorecard, and writes node-exporter textfile metrics.
#
# Fail-closed: if the scorecard cannot be produced or parsed, the exporter still
# publishes a well-formed metrics file that reports "not ready" (ready=0,
# blocker_count=1). It never writes sample lines with an empty value.
#
# Environment overrides: ROOT_DIR, TEXTFILE_DIR (or NODE_EXPORTER_TEXTFILE_DIR),
# LOG_DIR, OUTPUT_NAME, TARGET_MINUTES, MIN_FREE_GIB, LOCK_FILE.
set -uo pipefail

ROOT_DIR="${ROOT_DIR:-/opt/awoooi}"
TEXTFILE_DIR="${TEXTFILE_DIR:-${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}}"
LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}"
OUTPUT_NAME="${OUTPUT_NAME:-reboot_auto_recovery_slo.prom}"
TARGET_MINUTES="${TARGET_MINUTES:-10}"
MIN_FREE_GIB="${MIN_FREE_GIB:-2}"
LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-reboot-auto-recovery-slo.lock}"

# Skip this run quietly when a previous run still holds the lock.
if command -v flock >/dev/null 2>&1; then
  exec 9>"$LOCK_FILE"
  flock -n 9 || exit 0
fi

mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"
run_id="$(date '+%Y%m%d-%H%M%S')"
artifact_dir="$LOG_DIR/reboot-auto-recovery-slo-$run_id"
mkdir -p "$artifact_dir"
host_probe="$artifact_dir/host-probe.txt"
summary_file="$artifact_dir/summary.txt"
scorecard_file="$artifact_dir/scorecard.json"

# Each stage is best-effort: a failure leaves an incomplete artifact, which the
# scorecard turns into a blocker instead of aborting the export.
bash "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" >"$host_probe" 2>&1 || true
ARTIFACT_DIR="$artifact_dir/post-reboot-readiness" \
  bash "$ROOT_DIR/scripts/reboot-recovery/post-reboot-readiness-summary.sh" --no-color >"$summary_file" 2>&1 || true
python3 "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py" \
  --summary-file "$summary_file" \
  --host-probe-file "$host_probe" \
  --target-minutes "$TARGET_MINUTES" \
  --min-free-gib "$MIN_FREE_GIB" \
  --disk-path / \
  --output "$scorecard_file" || true

now="$(date +%s)"

# Read the scorecard once and print "<ready> <blocker_count> <max_uptime>".
scorecard_values="$(python3 - "$scorecard_file" <<'PY'
import json, sys
with open(sys.argv[1], encoding="utf-8") as handle:
    payload = json.load(handle)
ready = 1 if payload.get("can_claim_all_services_recovered_within_target") else 0
blocker_count = len(payload.get("active_blockers") or [])
max_uptime = int(payload.get("host_boot_detection", {}).get("max_observed_uptime_seconds", 0))
print(ready, blocker_count, max_uptime)
PY
)" || scorecard_values=""

ready=""
blocker_count=""
max_uptime=""
read -r ready blocker_count max_uptime <<<"$scorecard_values" || true

# Anything other than three well-formed integers means the scorecard is
# missing or corrupt. Report "not ready" rather than an invalid metrics file.
if [[ ! "$ready" =~ ^[01]$ || ! "$blocker_count" =~ ^[0-9]+$ || ! "$max_uptime" =~ ^-?[0-9]+$ ]]; then
  echo "reboot-auto-recovery-slo: scorecard unreadable at $scorecard_file; exporting fail-closed defaults" >&2
  ready=0
  blocker_count=1
  max_uptime=0
fi

# Write to a temp file in the same directory and rename it into place, so a
# reader never sees a half-written metrics file.
tmp_metric="$(mktemp "$TEXTFILE_DIR/.reboot_auto_recovery_slo.XXXXXX")" || exit 1
trap 'rm -f "$tmp_metric"' EXIT

cat >"$tmp_metric" <<METRICS
# HELP awoooi_reboot_auto_recovery_slo_ready Whether all P0 hosts auto-recovered within the target window.
# TYPE awoooi_reboot_auto_recovery_slo_ready gauge
awoooi_reboot_auto_recovery_slo_ready{scope="110_120_121_188",target_minutes="$TARGET_MINUTES"} $ready
# HELP awoooi_reboot_auto_recovery_slo_blocker_count Number of active fail-closed blockers for the reboot recovery SLO.
# TYPE awoooi_reboot_auto_recovery_slo_blocker_count gauge
awoooi_reboot_auto_recovery_slo_blocker_count{scope="110_120_121_188",target_minutes="$TARGET_MINUTES"} $blocker_count
# HELP awoooi_reboot_auto_recovery_slo_max_host_uptime_seconds Max observed host uptime in the boot probe.
# TYPE awoooi_reboot_auto_recovery_slo_max_host_uptime_seconds gauge
awoooi_reboot_auto_recovery_slo_max_host_uptime_seconds{scope="110_120_121_188"} $max_uptime
# HELP awoooi_reboot_auto_recovery_slo_last_run_timestamp Last SLO exporter run timestamp.
# TYPE awoooi_reboot_auto_recovery_slo_last_run_timestamp gauge
awoooi_reboot_auto_recovery_slo_last_run_timestamp{scope="110_120_121_188"} $now
METRICS
chmod 0644 "$tmp_metric"
mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"

View File

@@ -0,0 +1,273 @@
#!/usr/bin/env python3
"""Score the 10-minute AWOOOI reboot auto-recovery SLO."""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
# Third ancestor of this file's resolved path; source_file() joins
# repository-relative paths onto it.
ROOT = Path(__file__).resolve().parents[2]
# Emitted verbatim as "schema_version" in the scorecard payload.
SCHEMA_VERSION = "awoooi_reboot_auto_recovery_slo_scorecard_v1"
# Host aliases that must each appear in the host probe output.
REQUIRED_HOSTS = {"110", "120", "121", "188"}
def parse_args() -> argparse.Namespace:
    """Parse the scorecard command-line options from ``sys.argv``.

    Returns:
        Namespace with ``summary_file``, ``host_probe_file``,
        ``target_minutes``, ``min_free_gib``, ``disk_path``,
        ``generated_at`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        description="Fail-closed scorecard for automatic reboot recovery within 10 minutes.",
    )
    option_table = (
        ("--summary-file", {"type": Path, "help": "post-reboot-readiness-summary output."}),
        ("--host-probe-file", {"type": Path, "help": "reboot-auto-recovery-host-probe output."}),
        ("--target-minutes", {"type": int, "default": 10}),
        ("--min-free-gib", {"type": float, "default": 2.0}),
        ("--disk-path", {"type": Path, "help": "Optionally check local free space."}),
        ("--generated-at", {"help": "Override generated_at for stable snapshots."}),
        ("--output", {"type": Path, "help": "Write JSON to this path."}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def read_text(path: Path | None) -> str:
    """Return the text of *path*, or ``""`` when it cannot be read.

    An absent argument, a missing file, or any other OS-level read failure
    yields an empty string. The scorecard then reports the matching "missing"
    blocker instead of crashing before any output is written. Bytes that are
    not valid UTF-8 are replaced rather than raising, because the inputs are
    captured command output.

    Args:
        path: File to read, or ``None`` when the option was not supplied.

    Returns:
        The decoded file contents, or an empty string.
    """
    if not path:
        return ""
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        # Say why on stderr so an empty result can still be diagnosed.
        print(f"warning: cannot read {path}: {exc}", file=sys.stderr)
        return ""
def parse_kv(text: str) -> dict[str, str]:
    """Collect ``KEY=VALUE`` lines from *text* into a dict.

    Lines are stripped first. Blank lines, lines without ``=``, and lines
    beginning with ``"HOST_BOOT "`` are ignored. Each remaining line is split
    on the first ``=`` and both sides are stripped. A later duplicate key
    overwrites the earlier one.
    """
    pairs: dict[str, str] = {}
    for candidate in map(str.strip, text.splitlines()):
        if candidate.startswith("HOST_BOOT "):
            continue
        name, separator, payload = candidate.partition("=")
        if separator:
            pairs[name.strip()] = payload.strip()
    return pairs
def truthy(value: str | None) -> bool:
    """Return ``True`` when *value* spells an affirmative flag.

    Accepts ``1``, ``true`` and ``yes`` in any letter case, with surrounding
    whitespace ignored. Everything else, including ``None`` and the empty
    string, is ``False``.
    """
    if value is None:
        return False
    # Normalise case so "TRUE" and "Yes" are treated like "true" and "yes".
    return str(value).strip().lower() in {"1", "true", "yes"}
def int_value(value: Any, default: int = 0) -> int:
    """Convert *value* to ``int`` through its string form.

    Returns *default* when the string form is not an integer literal, for
    example ``None``, ``"unknown"`` or ``"1.5"``.
    """
    try:
        parsed = int(str(value))
    except (TypeError, ValueError):
        parsed = default
    return parsed
def parse_host_probe(text: str) -> list[dict[str, Any]]:
    """Turn ``HOST_BOOT`` lines from the host probe into row dicts.

    Only lines that start with ``"HOST_BOOT "`` after stripping are used. Each
    whitespace-separated ``key=value`` token becomes a dict entry. Two entries
    are then normalised: ``reachable`` becomes a bool that is true only for
    ``"1"``, and ``uptime_seconds`` becomes an int, with ``-1`` when it is
    missing or not numeric.
    """
    marker = "HOST_BOOT "
    parsed_rows: list[dict[str, Any]] = []
    for stripped in (raw.strip() for raw in text.splitlines()):
        if not stripped.startswith(marker):
            continue
        fields: dict[str, Any] = dict(
            token.split("=", 1) for token in stripped.split()[1:] if "=" in token
        )
        fields["reachable"] = fields.get("reachable") == "1"
        fields["uptime_seconds"] = int_value(fields.get("uptime_seconds"), -1)
        parsed_rows.append(fields)
    return parsed_rows
def source_file(path: str) -> Path:
    """Resolve a repository-relative *path* against ``ROOT``."""
    return ROOT.joinpath(path)
def file_contains(path: Path, *needles: str) -> bool:
    """Return ``True`` when *path* is readable and contains every needle.

    Any failure to read the file as UTF-8 text counts as "not present". This
    covers a missing file, a directory, a permission error and undecodable
    bytes, so the caller records a blocker instead of the scorecard crashing.

    Args:
        path: File to inspect.
        *needles: Substrings that must all occur in the file. With no needles
            the result is ``True`` for any readable file.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return False
    return all(needle in text for needle in needles)
def source_controls() -> dict[str, bool]:
    """Report which reboot-recovery source files are present in the repo.

    Returns:
        Mapping of ``*_source_present`` flags. Unit-file flags also require
        specific directives to appear in the file. The two startup-unit flags
        additionally require the matching shell script to exist.
    """

    def exists(relative: str) -> bool:
        return source_file(relative).exists()

    def declares(relative: str, *needles: str) -> bool:
        return file_contains(source_file(relative), *needles)

    host_110_unit = declares(
        "scripts/reboot-recovery/awoooi-startup-110.service",
        "ExecStart=/usr/local/bin/awoooi-startup-110.sh",
        "WantedBy=multi-user.target",
    ) and exists("scripts/reboot-recovery/awoooi-startup-110.sh")
    host_188_unit = declares(
        "scripts/reboot-recovery/awoooi-startup.service",
        "ExecStart=/usr/local/bin/awoooi-startup.sh",
        "WantedBy=multi-user.target",
    ) and exists("scripts/reboot-recovery/awoooi-startup.sh")
    boot_probe = exists("scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh")
    slo_exporter = exists("scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh")
    slo_service = declares(
        "scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service",
        "ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh",
    )
    slo_timer = declares(
        "scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer",
        "OnBootSec=",
        "OnUnitActiveSec=",
    )
    readiness_summary = exists("scripts/reboot-recovery/post-reboot-readiness-summary.sh")
    cold_start_exporter = exists("scripts/reboot-recovery/cold-start-textfile-exporter.sh")

    return {
        "host_110_startup_unit_source_present": host_110_unit,
        "host_188_startup_unit_source_present": host_188_unit,
        "host_boot_probe_source_present": boot_probe,
        "slo_exporter_source_present": slo_exporter,
        "slo_systemd_service_source_present": slo_service,
        "slo_systemd_timer_source_present": slo_timer,
        "post_reboot_summary_source_present": readiness_summary,
        "cold_start_textfile_exporter_source_present": cold_start_exporter,
    }
def disk_free_gib(path: Path | None) -> float | None:
    """Return the free space, in GiB, of the filesystem holding *path*.

    Uses ``shutil.disk_usage`` instead of parsing ``df`` output. ``df`` wraps
    long device names onto a second line, which shifts the columns.

    Args:
        path: Any path on the filesystem to measure, or ``None`` to skip the
            check.

    Returns:
        Free space in GiB, or ``None`` when *path* is ``None`` or the
        filesystem cannot be queried.
    """
    if path is None:
        return None
    import shutil  # local import keeps the module-level import block unchanged

    try:
        free_bytes = shutil.disk_usage(path).free
    except OSError:
        return None
    return free_bytes / 1024 / 1024 / 1024
def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
    """Evaluate every SLO gate and return the scorecard payload.

    The scorecard is fail-closed: ``can_claim_all_services_recovered_within_target``
    is true only when no blocker was recorded. Blockers come from four areas:

    * source controls: any ``*_present`` flag that is false;
    * host boot detection: no probe rows, required hosts missing, hosts
      unreachable, uptime above the target window, or uptime unknown;
    * post-reboot readiness summary: summary missing, ``POST_START_BLOCKED``
      not zero, any green flag not set, or Wazuh dashboard degraded;
    * capacity: free space below ``args.min_free_gib``, or free space that
      could not be determined although ``args.disk_path`` was supplied.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        JSON-serialisable dict with the verdict, the evidence per area and the
        sorted, de-duplicated ``active_blockers`` list.
    """
    target_seconds = args.target_minutes * 60
    summary = parse_kv(read_text(args.summary_file))
    host_rows = parse_host_probe(read_text(args.host_probe_file))
    controls = source_controls()
    free_gib = disk_free_gib(args.disk_path)

    # Each missing source control becomes "<name>_missing".
    blockers: list[str] = [
        key.replace("_present", "_missing") for key, present in controls.items() if not present
    ]

    host_aliases = {str(row.get("alias", "")) for row in host_rows}
    missing_hosts = sorted(REQUIRED_HOSTS - host_aliases)
    unreachable_hosts = sorted(str(row.get("alias")) for row in host_rows if not row.get("reachable"))
    # Stale: reachable, but already up for longer than the target window, so
    # this observation cannot prove recovery inside the window.
    stale_hosts = sorted(
        str(row.get("alias"))
        for row in host_rows
        if row.get("reachable") and int_value(row.get("uptime_seconds"), target_seconds + 1) > target_seconds
    )
    # parse_host_probe stores -1 when the uptime is missing or not numeric.
    unknown_uptime_hosts = sorted(
        str(row.get("alias"))
        for row in host_rows
        if row.get("reachable") and int_value(row.get("uptime_seconds"), -1) < 0
    )
    if not host_rows:
        blockers.append("all_host_reboot_detection_missing")
    if missing_hosts:
        blockers.append("host_boot_probe_missing_hosts")
    if unreachable_hosts:
        blockers.append("host_unreachable_after_reboot")
    if stale_hosts:
        blockers.append("host_boot_observation_older_than_target_window")
    if unknown_uptime_hosts:
        blockers.append("host_uptime_unknown")

    service_green = truthy(summary.get("SERVICE_GREEN"))
    product_data_green = truthy(summary.get("PRODUCT_DATA_GREEN"))
    backup_core_green = truthy(summary.get("BACKUP_CORE_GREEN"))
    # 999 is a sentinel so a missing POST_START_BLOCKED never reads as zero.
    post_start_blocked = int_value(summary.get("POST_START_BLOCKED"), 999)
    wazuh_dashboard_degraded = truthy(summary.get("WAZUH_DASHBOARD_DEGRADED"))
    host_188_service_green = truthy(summary.get("HOST_188_SERVICE_GREEN"))
    if not summary:
        blockers.append("post_reboot_summary_missing")
    if post_start_blocked != 0:
        blockers.append("post_start_blocked_not_zero")
    if not service_green:
        blockers.append("service_green_not_1")
    if not product_data_green:
        blockers.append("product_data_green_not_1")
    if not backup_core_green:
        blockers.append("backup_core_green_not_1")
    if not host_188_service_green:
        blockers.append("host_188_service_green_not_1")
    if wazuh_dashboard_degraded:
        blockers.append("wazuh_dashboard_degraded")

    if args.disk_path is not None and free_gib is None:
        # The check was requested but produced no reading. Fail closed rather
        # than treat unknown capacity as sufficient.
        blockers.append("local_disk_free_unknown")
    elif free_gib is not None and free_gib < args.min_free_gib:
        blockers.append("local_disk_free_below_minimum")

    max_uptime = max(
        [int_value(row.get("uptime_seconds"), 0) for row in host_rows if row.get("reachable")]
        or [0]
    )
    can_claim = not blockers
    return {
        "schema_version": SCHEMA_VERSION,
        "generated_at": args.generated_at
        or datetime.now().astimezone().isoformat(timespec="seconds"),
        "target_minutes": args.target_minutes,
        "target_seconds": target_seconds,
        "status": "slo_ready" if can_claim else "blocked_reboot_auto_recovery_slo_not_ready",
        "can_claim_all_services_recovered_within_target": can_claim,
        "source_controls": controls,
        "host_boot_detection": {
            "required_hosts": sorted(REQUIRED_HOSTS),
            "observed_hosts": sorted(host_aliases),
            "missing_hosts": missing_hosts,
            "unreachable_hosts": unreachable_hosts,
            "stale_hosts": stale_hosts,
            "unknown_uptime_hosts": unknown_uptime_hosts,
            "max_observed_uptime_seconds": max_uptime,
            "host_rows": host_rows,
        },
        "post_reboot_readiness": {
            "summary_present": bool(summary),
            "post_start_result": summary.get("POST_START_RESULT", "unknown"),
            "post_start_blocked": post_start_blocked,
            "service_green": service_green,
            "product_data_green": product_data_green,
            "backup_core_green": backup_core_green,
            "host_188_service_green": host_188_service_green,
            "wazuh_dashboard_degraded": wazuh_dashboard_degraded,
            "overall_declaration": summary.get("OVERALL_DECLARATION", "unknown"),
            "next_required_gates": summary.get("NEXT_REQUIRED_GATES", "unknown"),
        },
        "capacity": {
            "checked": free_gib is not None,
            "free_gib": round(free_gib, 3) if free_gib is not None else None,
            "min_free_gib": args.min_free_gib,
        },
        "active_blockers": sorted(set(blockers)),
        "safe_next_step": (
            "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_"
            "rerun_scorecard_until_status_slo_ready"
        ),
    }
def main() -> int:
    """Build the scorecard and emit it as sorted, indented JSON.

    Writes to ``--output`` when given, creating parent directories as needed.
    Otherwise prints to stdout. Always returns exit status 0; the verdict is
    carried in the payload.
    """
    options = parse_args()
    rendered = (
        json.dumps(build_scorecard(options), ensure_ascii=False, indent=2, sort_keys=True)
        + "\n"
    )
    destination = options.output
    if not destination:
        print(rendered, end="")
        return 0
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(rendered, encoding="utf-8")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,88 @@
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
# Fourth ancestor of this test file's resolved path; SCRIPT is located
# relative to it.
ROOT = Path(__file__).resolve().parents[3]
# The scorecard CLI under test; run_scorecard() executes it as a subprocess.
SCRIPT = ROOT / "scripts" / "reboot-recovery" / "reboot-auto-recovery-slo-scorecard.py"
# Readiness summary in which every gate read by the scorecard is green.
GREEN_SUMMARY = """\
AWOOOI_POST_REBOOT_READINESS_SUMMARY=1
POST_START_RESULT=FULL_STACK_GREEN
POST_START_BLOCKED=0
SERVICE_GREEN=1
PRODUCT_DATA_GREEN=1
BACKUP_CORE_GREEN=1
HOST_188_SERVICE_GREEN=1
WAZUH_DASHBOARD_DEGRADED=0
OVERALL_DECLARATION=FULL_STACK_GREEN
NEXT_REQUIRED_GATES=none
"""
# Host probe output with all four hosts reachable and uptimes of 120-150
# seconds, well inside the default 600-second target window.
HOST_PROBE_GREEN = """\
AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1
TARGET_HOSTS=110,120,121,188
HOST_BOOT alias=110 target=wooo@192.168.0.110 startup_unit=awoooi-startup-110.service reachable=1 boot_id=a uptime_seconds=120 systemd_state=running startup_enabled=enabled startup_active=active
HOST_BOOT alias=120 target=wooo@192.168.0.120 startup_unit=k3s.service reachable=1 boot_id=b uptime_seconds=130 systemd_state=running startup_enabled=enabled startup_active=active
HOST_BOOT alias=121 target=wooo@192.168.0.121 startup_unit=k3s.service reachable=1 boot_id=c uptime_seconds=140 systemd_state=running startup_enabled=enabled startup_active=active
HOST_BOOT alias=188 target=ollama@192.168.0.188 startup_unit=awoooi-startup.service reachable=1 boot_id=d uptime_seconds=150 systemd_state=running startup_enabled=enabled startup_active=active
"""
def run_scorecard(tmp_path: Path, summary: str, probe: str = HOST_PROBE_GREEN) -> dict:
    """Run the scorecard CLI on the given inputs and return its parsed JSON.

    Writes *summary* and *probe* into *tmp_path*, runs the script with a fixed
    ``--generated-at``, and decodes what it prints on stdout. A non-zero exit
    raises ``subprocess.CalledProcessError``.
    """
    inputs = {"summary.txt": summary, "probe.txt": probe}
    for filename, content in inputs.items():
        (tmp_path / filename).write_text(content, encoding="utf-8")
    command = [
        sys.executable,
        str(SCRIPT),
        "--summary-file",
        str(tmp_path / "summary.txt"),
        "--host-probe-file",
        str(tmp_path / "probe.txt"),
        "--generated-at",
        "2026-06-29T14:30:00+08:00",
    ]
    completed = subprocess.run(command, text=True, capture_output=True, check=True)
    return json.loads(completed.stdout)
def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -> None:
    """An all-green summary and a fresh probe of every host yield ``slo_ready``."""
    scorecard = run_scorecard(tmp_path, GREEN_SUMMARY)
    expected_fields = {
        "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1",
        "status": "slo_ready",
        "active_blockers": [],
    }
    for field, expected in expected_fields.items():
        assert scorecard[field] == expected
    assert scorecard["can_claim_all_services_recovered_within_target"] is True
    assert scorecard["host_boot_detection"]["max_observed_uptime_seconds"] == 150
def test_missing_probe_fails_closed(tmp_path: Path) -> None:
    """An empty host probe blocks the SLO claim even when the summary is green."""
    scorecard = run_scorecard(tmp_path, GREEN_SUMMARY, probe="")
    blockers = scorecard["active_blockers"]
    assert scorecard["status"] == "blocked_reboot_auto_recovery_slo_not_ready"
    assert scorecard["can_claim_all_services_recovered_within_target"] is False
    for expected_blocker in (
        "all_host_reboot_detection_missing",
        "host_boot_probe_missing_hosts",
    ):
        assert expected_blocker in blockers
def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None:
    """A degraded Wazuh dashboard and a 900-second uptime each add a blocker."""
    degraded_summary = GREEN_SUMMARY.replace(
        "WAZUH_DASHBOARD_DEGRADED=0", "WAZUH_DASHBOARD_DEGRADED=1"
    )
    stale_probe = HOST_PROBE_GREEN.replace("uptime_seconds=150", "uptime_seconds=900")
    scorecard = run_scorecard(tmp_path, degraded_summary, probe=stale_probe)
    blockers = scorecard["active_blockers"]
    assert scorecard["status"] == "blocked_reboot_auto_recovery_slo_not_ready"
    assert scorecard["can_claim_all_services_recovered_within_target"] is False
    for expected_blocker in (
        "wazuh_dashboard_degraded",
        "host_boot_observation_older_than_target_window",
    ):
        assert expected_blocker in blockers