diff --git a/apps/api/src/services/awoooi_priority_work_order_readback.py b/apps/api/src/services/awoooi_priority_work_order_readback.py index e413336b..4a9d1519 100644 --- a/apps/api/src/services/awoooi_priority_work_order_readback.py +++ b/apps/api/src/services/awoooi_priority_work_order_readback.py @@ -459,9 +459,9 @@ _COMMANDER_INSERTED_REQUIREMENT_WORK_ITEMS: list[dict[str, Any]] = [ "lane": "reboot_runbook_fixed_order", "request": "每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA。", "normalized_work_item": "固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command。", - "current_state": "部分已有 scorecard / SOP;仍需所有回報統一格式。", + "current_state": "scorecard source / snapshot / API 已補 current_phase、eta_or_wait_reason、primary_blocker、next_safe_action;仍需 production deploy readback 與下一次 fresh reboot/drill 證明。", "acceptance": "SLO scorecard 強制輸出 current_phase、eta_or_wait_reason、active_blockers、next_safe_action。", - "next_action": "將 reboot SLO scorecard 補齊固定輸出欄位。", + "next_action": "推送後讀回 /api/v1/agents/reboot-auto-recovery-slo-scorecard,確認固定欄位在 production 可見;再接 99/VMware 與 fresh reboot/drill。", "mapped_workplan_id": "P0-006-REBOOT-AUTO-RECOVERY-SLO-SCORECARD", }, { diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 3e8ea589..46133655 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -83,6 +83,12 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "host_boot_observation_older_than_target_window" ] latest_verify_only_metric_present = bool(latest_verify_metric) + sop_progress = _build_reboot_sop_progress( + scorecard=scorecard, + active_blockers=active_blockers, + readiness_percent=readiness_percent, + can_claim_slo=can_claim_slo, + ) rollups = { "active_blocker_count": active_blocker_count, "readiness_percent": readiness_percent, @@ -137,7 
# Blockers that put the scorecard in the "host_boot_detection_blocked" phase.
_REBOOT_SOP_HOST_BOOT_BLOCKERS = frozenset(
    {
        "all_host_reboot_detection_missing",
        "stateful_reboot_event_detection_missing",
        "host_boot_probe_missing_hosts",
        "host_unreachable_after_reboot",
        "host_boot_observation_older_than_target_window",
        "host_uptime_unknown",
        "reboot_event_missing_required_hosts",
        "reboot_event_required_host_unreachable",
        "fresh_all_host_reboot_event_missing",
        "all_required_hosts_not_in_10_minute_reboot_window",
    }
)

# Blockers that put the scorecard in "post_reboot_service_readiness_blocked".
_REBOOT_SOP_SERVICE_BLOCKERS = frozenset(
    {
        "post_reboot_summary_missing",
        "post_start_blocked_not_zero",
        "service_green_not_1",
        "host_188_service_green_not_1",
        "wazuh_dashboard_degraded",
    }
)

# Ranking used to pick the single blocker reported as ``primary_blocker``.
# Every host-boot blocker is ranked ahead of every service blocker except
# "wazuh_dashboard_degraded", so the reported primary blocker agrees with the
# phase chosen by ``_reboot_sop_current_phase`` for all other combinations.
# NOTE(review): "wazuh_dashboard_degraded" is a service blocker for phase
# purposes but is ranked after the backup/disk blockers here - confirm that
# ordering is intended.
_REBOOT_SOP_BLOCKER_PRIORITY = (
    "reboot_event_required_host_unreachable",
    "host_unreachable_after_reboot",
    "all_host_reboot_detection_missing",
    "stateful_reboot_event_detection_missing",
    "host_boot_probe_missing_hosts",
    "reboot_event_missing_required_hosts",
    "fresh_all_host_reboot_event_missing",
    "all_required_hosts_not_in_10_minute_reboot_window",
    "host_boot_observation_older_than_target_window",
    "host_uptime_unknown",
    "post_reboot_summary_missing",
    "post_start_blocked_not_zero",
    "service_green_not_1",
    "host_188_service_green_not_1",
    "product_data_green_not_1",
    "stockplatform_freshness_blocked",
    "stockplatform_ingestion_blocked",
    "backup_core_green_not_1",
    "local_disk_free_below_minimum",
    "wazuh_dashboard_degraded",
)


def _build_reboot_sop_progress(
    *,
    scorecard: dict[str, Any],
    active_blockers: list[str],
    readiness_percent: int,
    can_claim_slo: bool,
) -> dict[str, Any]:
    """Assemble the fixed reboot-SOP progress fields for the readback payload.

    Args:
        scorecard: Source scorecard mapping. The keys read here are
            ``sla_recovery_eta``, ``reboot_event_detection``,
            ``next_safe_action`` and ``safe_next_step``.
        active_blockers: Blocker identifiers currently active.
        readiness_percent: Readiness value copied into the result unchanged.
        can_claim_slo: Whether the scorecard may claim the SLO.

    Returns:
        A dict with ``current_phase``, ``eta_or_wait_reason``,
        ``primary_blocker``, ``active_blockers``, ``active_blocker_count``,
        ``readiness_percent``, ``next_safe_action`` and ``fixed_triage_order``.
    """
    current_phase = _reboot_sop_current_phase(active_blockers, can_claim_slo)
    primary_blocker = _reboot_sop_primary_blocker(active_blockers)
    eta_or_wait_reason = _reboot_sop_eta_or_wait_reason(
        scorecard=scorecard,
        active_blockers=active_blockers,
        current_phase=current_phase,
        primary_blocker=primary_blocker,
    )
    sla_eta = _dict(scorecard.get("sla_recovery_eta"))
    return {
        "current_phase": current_phase,
        "eta_or_wait_reason": eta_or_wait_reason,
        "primary_blocker": primary_blocker,
        # Copy so the nested progress dict does not alias the caller's list.
        "active_blockers": list(active_blockers),
        "active_blocker_count": len(active_blockers),
        "readiness_percent": readiness_percent,
        # Prefer an explicit next_safe_action; fall back to safe_next_step.
        "next_safe_action": str(
            scorecard.get("next_safe_action")
            or scorecard.get("safe_next_step")
            or ""
        ),
        "fixed_triage_order": _strings(sla_eta.get("fixed_triage_order")),
    }


def _reboot_sop_current_phase(active_blockers: list[str], can_claim_slo: bool) -> str:
    """Classify the active blockers into one fixed phase identifier.

    Checks run in a fixed order and the first match wins: SLO ready, waiting
    for a fresh reboot window (only the stale-observation blocker is active),
    host boot detection, post-reboot service readiness, product data
    freshness, backup readback, host capacity, host pressure, and finally the
    generic ``slo_blocked``.
    """
    if can_claim_slo and not active_blockers:
        return "slo_ready"
    # Exactly one blocker, and it is the stale boot observation.
    if active_blockers == ["host_boot_observation_older_than_target_window"]:
        return "awaiting_next_reboot_or_approved_drill"
    if not _REBOOT_SOP_HOST_BOOT_BLOCKERS.isdisjoint(active_blockers):
        return "host_boot_detection_blocked"
    if not _REBOOT_SOP_SERVICE_BLOCKERS.isdisjoint(active_blockers):
        return "post_reboot_service_readiness_blocked"
    if any(
        "stockplatform" in blocker or "product_data" in blocker
        for blocker in active_blockers
    ):
        return "product_data_freshness_blocked"
    if any("backup" in blocker for blocker in active_blockers):
        return "backup_readback_blocked"
    if "local_disk_free_below_minimum" in active_blockers:
        return "host_capacity_blocked"
    if any(blocker.startswith(("host_", "awooop_")) for blocker in active_blockers):
        return "host_pressure_blocked"
    return "slo_blocked"


def _reboot_sop_primary_blocker(active_blockers: list[str]) -> str:
    """Return the highest-ranked active blocker, or ``""`` when none is active.

    Blockers absent from the priority table fall back to the first entry of
    ``active_blockers``.
    """
    for blocker in _REBOOT_SOP_BLOCKER_PRIORITY:
        if blocker in active_blockers:
            return blocker
    return active_blockers[0] if active_blockers else ""


def _reboot_sop_eta_or_wait_reason(
    *,
    scorecard: dict[str, Any],
    active_blockers: list[str],
    current_phase: str,
    primary_blocker: str,
) -> str:
    """Return a fixed-format ETA string, or the reason no ETA can be given.

    Evaluation order: SLO already ready; target window still has seconds
    remaining; reboot-event readback missing; target window elapsed; any
    remaining active blocker; otherwise a generic ``eta_unavailable``.
    """
    if current_phase == "slo_ready":
        return "recovered_within_10_minute_slo"
    sla_eta = _dict(scorecard.get("sla_recovery_eta"))
    reboot_event = _dict(scorecard.get("reboot_event_detection"))
    remaining = _int(sla_eta.get("target_seconds_remaining"))
    deadline_status = str(sla_eta.get("deadline_status") or "unknown")
    if remaining > 0:
        return (
            f"target_window_remaining_{remaining}s_but_blocked_by_"
            f"{primary_blocker or 'unknown'}"
        )
    # Only an explicit True counts as a present readback.
    if reboot_event.get("readback_present") is not True:
        return "reboot_event_readback_missing_eta_unavailable"
    if deadline_status == "target_window_elapsed":
        return (
            "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_"
            "event_and_probe"
        )
    if active_blockers:
        return f"eta_unavailable_until_primary_blocker_clears:{primary_blocker}"
    return "eta_unavailable"
"target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_" + "event_and_probe" + ) + assert payload["readback"]["primary_blocker"] == ( + "reboot_event_required_host_unreachable" + ) assert payload["readback"]["safe_next_step"] == ( "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_" "rerun_scorecard_until_status_slo_ready" ) + assert payload["readback"]["next_safe_action"] == payload["safe_next_step"] assert payload["readback"]["active_blocker_count"] == 12 assert payload["readback"]["readiness_percent"] == 18 assert payload["readback"]["blocked_by_fresh_reboot_window_only"] is False @@ -137,6 +153,17 @@ def _assert_reboot_slo_payload(payload: dict): is False ) assert payload["active_blockers"] == EXPECTED_REBOOT_SLO_BLOCKERS + progress = payload["reboot_sop_progress"] + assert progress["current_phase"] == "host_boot_detection_blocked" + assert progress["eta_or_wait_reason"] == ( + "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_" + "event_and_probe" + ) + assert progress["primary_blocker"] == "reboot_event_required_host_unreachable" + assert progress["active_blocker_count"] == 12 + assert progress["readiness_percent"] == 18 + assert progress["next_safe_action"] == payload["safe_next_step"] + assert progress["fixed_triage_order"][0] == "99_vmware_autostart_and_vm_power" stockplatform = payload["stockplatform_data_freshness"] assert stockplatform["freshness_endpoint_readback_present"] is True assert stockplatform["ingestion_endpoint_readback_present"] is True diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3b852af2..3cdfa368 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -16,6 +16,17 @@ **仍維持**: - 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未送 Telegram;未對 production DB 寫入。 +## 2026-07-02 — 13:52 reboot SOP 固定 phase / ETA / blocker readback + +**完成內容**: +- `scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py` 
補齊固定輸出欄位:`current_phase`、`eta_or_wait_reason`、`primary_blocker`、`next_safe_action`、`reboot_sop_progress`,避免每次主機重啟後重新臨場發明排查順序。 +- `apps/api/src/services/reboot_auto_recovery_slo_scorecard.py` 即使讀 committed snapshot,也會輸出相同欄位;目前 snapshot 讀回固定為 `current_phase=host_boot_detection_blocked`、`primary_blocker=reboot_event_required_host_unreachable`、`eta_or_wait_reason=target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe`。 +- `docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json`、`docs/runbooks/FULL-STACK-COLD-START-SOP.md` v1.93、`docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md` 與 AwoooP priority readback source 已同步,`CIR-P0-RBT-008` 不再停留在「缺固定回報欄位」。 + +**仍維持**: +- 這是 SOP / scorecard / API readback 第一批閉環,不是 fresh all-host reboot/drill 證明;仍不可宣稱 10 分鐘自動恢復 SLA 已成立。 +- 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未觸發 workflow;未重啟主機 / Docker / Nginx / K3s / DB / firewall。 + ## 2026-07-02 — 13:35 Telegram 告警 egress 全量 live readback **完成內容**: diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index f164d61d..86c15aa4 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -19,6 +19,8 @@ "free_gib": 0.751, "min_free_gib": 2.0 }, + "current_phase": "host_boot_detection_blocked", + "eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", "generated_at": "2026-06-30T20:44:37+08:00", "host_boot_detection": { "host_rows": [ @@ -134,6 +136,7 @@ "111" ] }, + "next_safe_action": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", "post_reboot_readiness": { "backup_core_green": false, "host_188_service_green": false, @@ -163,7 +166,11 @@ 
"wazuh_dashboard_degraded" ], "blocked_by_fresh_reboot_window_only": false, + "current_phase": "host_boot_detection_blocked", + "eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", "host_reboot_authorized_by_this_scorecard": false, + "next_safe_action": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", + "primary_blocker": "reboot_event_required_host_unreachable", "readiness_percent": 18, "required_checks": { "backup_core_green": false, @@ -189,6 +196,7 @@ "workplan_id": "P0-006", "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO" }, + "primary_blocker": "reboot_event_required_host_unreachable", "reboot_event_detection": { "all_required_hosts_in_reboot_window": false, "all_required_hosts_observed": false, @@ -208,6 +216,37 @@ "111" ] }, + "reboot_sop_progress": { + "active_blocker_count": 12, + "active_blockers": [ + "all_required_hosts_not_in_10_minute_reboot_window", + "backup_core_green_not_1", + "host_188_service_green_not_1", + "host_boot_observation_older_than_target_window", + "host_unreachable_after_reboot", + "host_uptime_unknown", + "local_disk_free_below_minimum", + "post_start_blocked_not_zero", + "product_data_green_not_1", + "reboot_event_required_host_unreachable", + "service_green_not_1", + "wazuh_dashboard_degraded" + ], + "current_phase": "host_boot_detection_blocked", + "eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", + "fixed_triage_order": [ + "99_vmware_autostart_and_vm_power", + "host_boot_event_and_node_exporter", + "public_routes_and_maintenance_fallback", + "awoooi_k3s_workloads_and_registry", + "stockplatform_public_api_and_freshness", + "backup_health_and_offsite_evidence", + "telegram_alert_delivery_readback" + ], + "next_safe_action": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", + "primary_blocker": 
"reboot_event_required_host_unreachable", + "readiness_percent": 18 + }, "required_checks": { "backup_core_green": false, "can_claim_slo": false, @@ -354,11 +393,15 @@ "reboot_auto_recovery_active_blocker_count": 12, "reboot_auto_recovery_backup_core_green": false, "reboot_auto_recovery_can_claim_slo": false, + "reboot_auto_recovery_current_phase": "host_boot_detection_blocked", + "reboot_auto_recovery_eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", "reboot_auto_recovery_host_188_service_green": false, "reboot_auto_recovery_observed_host_count": 7, "reboot_auto_recovery_product_data_green": false, + "reboot_auto_recovery_primary_blocker": "reboot_event_required_host_unreachable", "reboot_auto_recovery_readiness_percent": 18, "reboot_auto_recovery_safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", + "reboot_auto_recovery_next_safe_action": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", "reboot_auto_recovery_service_green": false, "reboot_auto_recovery_source_controls_present": true, "reboot_auto_recovery_stale_host_count": 5, diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 81021e74..9cc57540 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.92 +> Version: v1.93 > Last updated: 2026-07-02 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. 
@@ -18,6 +18,8 @@ v1.79 active owner response template rule:同一輪 owner packet 產生後,p v1.80 / v1.81 credential escrow intake scorecard rule:同一輪 owner response preflight 後,必須用 `scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py --summary-file "$ARTIFACT_DIR/summary.txt" --owner-packet-file --response-file --offsite-report-file --escrow-status-file ` 收斂 DR escrow gate。scorecard 只讀 sanitized artifacts;不得讀 secret value、不得寫 marker、不得送 owner request、不得開 runtime gate。placeholder readback 期望 `STATUS=blocked_waiting_non_secret_credential_escrow_evidence`、`EFFECTIVE_ESCROW_MISSING_COUNT=5`、`OWNER_RESPONSE_RECEIVED_COUNT=0`、`OWNER_RESPONSE_ACCEPTED_COUNT=0`、`RUNTIME_GATE_COUNT=0`、`CREDENTIAL_MARKER_WRITE_AUTHORIZED_COUNT=0`。若未來收到合格 redacted owner response 並由 preflight 回 `ready_for_independent_reviewer_acceptance`,scorecard 應轉為 `STATUS=ready_for_independent_reviewer_acceptance`;即使 marker 尚未寫入,也只能進 `independent_reviewer_acceptance_then_marker_dry_run`,不得直接寫 marker 或宣稱 `DR_COMPLETE`。 +v1.93 reboot SLA readback fixed fields rule:`scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py`、`docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json` 與 `/api/v1/agents/reboot-auto-recovery-slo-scorecard` 必須固定輸出 `current_phase`、`eta_or_wait_reason`、`primary_blocker`、`active_blockers`、`next_safe_action` 與 `reboot_sop_progress.fixed_triage_order`。回報時先讀這些欄位,不得重新臨場發明排查順序;若 `current_phase=host_boot_detection_blocked` 且 `eta_or_wait_reason=target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe`,只能宣稱「10 分鐘 SLO 尚未證明」,下一步固定補 fresh all-host reboot event / probe 與 99 VMware/Windows readback,不得把部分 Linux host green 當全主機 green。 + 2026-07-02 110 control-path / Harbor recovery receipt rule:若 Gitea Harbor repair queue 仍保留 `harbor_110_remote_ssh_publickey_auth_stalled`、remote-control unavailable、jobs stale 或 historical failure,但同一輪本地證據同時證明 `wooo` command path ready、110 local Harbor `/v2/` ready、public/internal registry `/v2/` 回 `401`,則該 
Gitea Harbor repair 失敗只能列為 historical queue metadata,不得再當成 current SSH blocker。必須用 `/api/v1/agents/harbor-registry-controlled-recovery-receipt` 或同等 validator 合併 `diagnose-110-ssh-publickey-auth.sh`、`recover-110-control-path-and-harbor-local.sh --check`、public Gitea queue readback 與 registry `/v2/` verifier,並把機器可讀結果寫入 `docs/operations/harbor-110-control-path-recovery-readback-2026-07-02.snapshot.json` 類型的 snapshot。2026-07-02 live receipt 顯示:public/internal registry `/v2/` 均為 `401`、latest visible CD `#4335` 為 `Success`、Gitea Harbor repair failure 已是 `historical_after_latest_cd_success=true`;active blockers 收斂為 110 controlled CD lane config / binary / registration / service guardrail、active action container pressure,以及 Gitea CD jobs head-SHA / stale readback mismatch。若 local-console output 只有 `AWOOOI_110_CONTROLLED_CD_LANE_READY` marker,non110 runner parser 不得從 110 `BLOCKER` 行推導 non110 blocker;non110 只有看到 `AWOOOI_NON110_RUNNER_READY` marker 才能列入 active blocker。 2026-07-02 110 controlled CD lane fail-closed enforcer staging rule:110 runner 壓力事故後,legacy / generic runner 仍必須 fail-closed;但 `awoooi-cd-lane-drain.service` 的非 secret staging artifact 不得再被 enforcer 無差別封回 stub。`scripts/reboot-recovery/enforce-110-runner-failclosed.sh` 只有在 `config.yaml` 符合 `capacity <= 1`、只含 `awoooi-host:host` 與 `awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04`、binary 是 executable ELF、systemd unit 具備 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`、`CPUAccounting` / `MemoryAccounting` / `TasksAccounting` / `NoNewPrivileges` 等 guardrail,且 service `inactive`、`MainPID=0`、未 enabled / 未 masked 時,才可保留 drain config / binary / unit,並輸出 `CONTROLLED_DRAIN_STAGING_ALLOWED=1` 與 textfile metric。此 staging 規則不得讀 token、不得讀 `.runner` 內容、不得註冊 runner、不得啟動 service;若 registration 缺失,readiness verifier 仍必須只留下 `controlled_cd_lane_registration_missing` / `controlled_cd_lane_service_not_active` 類 blocker。若 `CONTROLLED_DRAIN_STAGING_ALLOWED=0` 且 config / binary 又被搬走,優先修 source 
enforcer / unit guardrail,不要手工反覆補同一組 artifact。 diff --git a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md index 2806ca03..e64459c7 100644 --- a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md +++ b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md @@ -64,7 +64,7 @@ | 5 | CIR-P0-RBT-005 | P0 | 「網站重啟後 502 嚴重影響體驗,要維護頁,外部雲端或專業做法」 | Public maintenance fallback:Nginx / edge / external static maintenance page / status page / fail-open UX,避免 502 直出 | 尚未完整落地;目前是需求缺口 | 產生 `public_maintenance_fallback` decision record:DNS/edge/外部雲端/本地 Nginx fallback 風險比較,先做不切流量的 check-mode | | 6 | CIR-P0-RBT-006 | P0 | 「所有主機關機立刻 Telegram 告警,重啟後也要告警,其他告警一併完整思考」 | Down / shutdown suspected / reboot detected / reboot recovered / SLO missed / backup failed / freshness stale / CPU pressure / Gitea queue 告警矩陣 | 部分已有 Alertmanager rule 與 Telegram receipt 補強;仍缺完整 shutdown/up E2E receipt | 建立 Telegram alert matrix + receipt verifier,逐項讀回 Alertmanager active/resolved 與 outbound receipt,不送測試 secret | | 7 | CIR-P0-RBT-007 | P0 | 「所有備份包含主機、DB、網站、服務、套件、工具、日誌都沒有監控告警」 | Backup observability coverage:backup job inventory、last success、freshness、offsite、restore drill、Telegram receipt | 部分已有 backup health exporter / alert rules;全域 coverage 與 restore drill 未全綠 | 建立 backup coverage matrix:host / DB / website / service config / package list / tool scripts / logs,每列有 metric、alert、last_success、restore_verifier | -| 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command | 部分已有 scorecard / SOP;仍需所有回報統一格式 | 將 SLO scorecard 強制輸出 `current_phase`、`eta_or_wait_reason`、`active_blockers`、`next_safe_action` | +| 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next 
command | 已補 scorecard source / snapshot / API 固定欄位:`current_phase`、`eta_or_wait_reason`、`primary_blocker`、`next_safe_action`;仍需 production deploy readback 與下一次 fresh reboot/drill 證明 | 推送後讀回 `/api/v1/agents/reboot-auto-recovery-slo-scorecard`,確認固定欄位在 production 可見;再接 99/VMware 與 fresh reboot/drill | | 9 | CIR-P0-RBT-009 | P0 | 「所有產品、網站都要是最新版本;版本和數據是否最新要驗證」 | Product freshness/version matrix:source commit、deploy marker、runtime image、public health、data freshness、latest source availability | AWOOOI / StockPlatform 部分已在做;全產品未統一 | 建立全產品 readback 表:product、canonical repo、main SHA、deploy marker、public URL、data freshness、blocked reason | | 10 | CIR-P0-GIT-001 | P0 | 「Gitea 儲存庫都不見了?Gitea 沒完整備份嗎?」 | Gitea repository identity + backup proof + restore drill:不能只看 UI visible,要比對 SSH heads、repo path、bundle backup、restore sample | 已有 9 expected repos OK / backup health missing=0 的 handoff;仍需 restore drill 證明 | 補 Gitea repo bundle backup readback + sample restore dry-run verifier;禁止刪 repo / 改 visibility | | 11 | CIR-P0-CPU-001 | P0 | 「110 / 188 CPU 負載持續過高,為什麼沒監控告警、沒主動修復」 | Sustained CPU pressure automation:Alertmanager → controller → evidence → service playbook → verifier → KM writeback | 110 已有 `Host110SustainedModeratePressure`、Gitea playbook、Stock/Postgres evidence;188 仍需同級 controller/alerts readback | 下一步接 `postgres_hot_query_or_backup_export_playbook`;並補 188 equivalent readback,不以單次下降結案 | @@ -103,7 +103,7 @@ | KM / PlayBook / RAG / MCP 整合 | 已被列為 P1,不再遺漏 | 建立 work item schema 與 trust writeback 欄位 | | OpenClaw / Gather-style 持續動畫工作室 | route 已存在,已列為 P1 工作項 | 補 production desktop/mobile smoke、AwoooP 導流與截圖證據 | | AI 專業 UI / 非文字牆 cockpit | 已列為 P2 UX 驗收 | 將長文字區塊收斂成 first-viewport cockpit、cards、flow rows 與 expandable details | -| 10 分鐘 reboot auto-recovery SLA | SLO exporter / alerts 部分存在,但缺 fresh all-host reboot/drill proof | 補最新 scorecard readback,缺事件則明確標等待下一次 reboot 或 approved drill | +| 10 分鐘 reboot auto-recovery SLA | scorecard source / snapshot / API 已補固定 phase / ETA / blocker / 
# Blockers that put the scorecard in the "host_boot_detection_blocked" phase.
_REBOOT_SOP_HOST_BOOT_BLOCKERS = frozenset(
    {
        "all_host_reboot_detection_missing",
        "stateful_reboot_event_detection_missing",
        "host_boot_probe_missing_hosts",
        "host_unreachable_after_reboot",
        "host_boot_observation_older_than_target_window",
        "host_uptime_unknown",
        "reboot_event_missing_required_hosts",
        "reboot_event_required_host_unreachable",
        "fresh_all_host_reboot_event_missing",
        "all_required_hosts_not_in_10_minute_reboot_window",
    }
)

# Blockers that put the scorecard in "post_reboot_service_readiness_blocked".
_REBOOT_SOP_SERVICE_BLOCKERS = frozenset(
    {
        "post_reboot_summary_missing",
        "post_start_blocked_not_zero",
        "service_green_not_1",
        "host_188_service_green_not_1",
        "wazuh_dashboard_degraded",
    }
)

# Ranking used to pick the single blocker reported as ``primary_blocker``.
# Every host-boot blocker is ranked ahead of every service blocker except
# "wazuh_dashboard_degraded", so the reported primary blocker agrees with the
# phase chosen by ``reboot_sop_current_phase`` for all other combinations.
# NOTE(review): "wazuh_dashboard_degraded" is a service blocker for phase
# purposes but is ranked after the backup/disk blockers here - confirm that
# ordering is intended.
_REBOOT_SOP_BLOCKER_PRIORITY = (
    "reboot_event_required_host_unreachable",
    "host_unreachable_after_reboot",
    "all_host_reboot_detection_missing",
    "stateful_reboot_event_detection_missing",
    "host_boot_probe_missing_hosts",
    "reboot_event_missing_required_hosts",
    "fresh_all_host_reboot_event_missing",
    "all_required_hosts_not_in_10_minute_reboot_window",
    "host_boot_observation_older_than_target_window",
    "host_uptime_unknown",
    "post_reboot_summary_missing",
    "post_start_blocked_not_zero",
    "service_green_not_1",
    "host_188_service_green_not_1",
    "product_data_green_not_1",
    "stockplatform_freshness_blocked",
    "stockplatform_ingestion_blocked",
    "backup_core_green_not_1",
    "local_disk_free_below_minimum",
    "wazuh_dashboard_degraded",
)


def reboot_sop_current_phase(active_blockers: list[str], can_claim: bool) -> str:
    """Classify the active blockers into one fixed phase identifier.

    Checks run in a fixed order and the first match wins: SLO ready, waiting
    for a fresh reboot window (only the stale-observation blocker is active),
    host boot detection, post-reboot service readiness, product data
    freshness, backup readback, host capacity, host pressure, and finally the
    generic ``slo_blocked``.
    """
    if can_claim and not active_blockers:
        return "slo_ready"
    # Exactly one blocker, and it is the stale boot observation.
    if active_blockers == ["host_boot_observation_older_than_target_window"]:
        return "awaiting_next_reboot_or_approved_drill"
    if not _REBOOT_SOP_HOST_BOOT_BLOCKERS.isdisjoint(active_blockers):
        return "host_boot_detection_blocked"
    if not _REBOOT_SOP_SERVICE_BLOCKERS.isdisjoint(active_blockers):
        return "post_reboot_service_readiness_blocked"
    if any("stockplatform" in blocker or "product_data" in blocker for blocker in active_blockers):
        return "product_data_freshness_blocked"
    if any("backup" in blocker for blocker in active_blockers):
        return "backup_readback_blocked"
    if "local_disk_free_below_minimum" in active_blockers:
        return "host_capacity_blocked"
    if any(blocker.startswith(("host_", "awooop_")) for blocker in active_blockers):
        return "host_pressure_blocked"
    return "slo_blocked"


def reboot_sop_primary_blocker(active_blockers: list[str]) -> str:
    """Return the highest-ranked active blocker, or ``""`` when none is active.

    Blockers absent from the priority table fall back to the first entry of
    ``active_blockers``.
    """
    for blocker in _REBOOT_SOP_BLOCKER_PRIORITY:
        if blocker in active_blockers:
            return blocker
    return active_blockers[0] if active_blockers else ""


def reboot_sop_eta_or_wait_reason(
    payload: dict[str, Any],
    active_blockers: list[str],
    current_phase: str,
    primary_blocker: str,
) -> str:
    """Return a fixed-format ETA string, or the reason no ETA can be given.

    Evaluation order: SLO already ready; target window still has seconds
    remaining; reboot-event readback missing; target window elapsed; any
    remaining active blocker; otherwise a generic ``eta_unavailable``.
    """
    if current_phase == "slo_ready":
        return "recovered_within_10_minute_slo"
    # Non-dict sections are treated as empty.
    sla_eta = payload.get("sla_recovery_eta")
    if not isinstance(sla_eta, dict):
        sla_eta = {}
    reboot_event = payload.get("reboot_event_detection")
    if not isinstance(reboot_event, dict):
        reboot_event = {}
    remaining = int_value(sla_eta.get("target_seconds_remaining"), 0)
    deadline_status = str(sla_eta.get("deadline_status") or "unknown")
    if remaining > 0:
        return f"target_window_remaining_{remaining}s_but_blocked_by_{primary_blocker or 'unknown'}"
    # Only an explicit True counts as a present readback.
    if reboot_event.get("readback_present") is not True:
        return "reboot_event_readback_missing_eta_unavailable"
    if deadline_status == "target_window_elapsed":
        return (
            "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_"
            "event_and_probe"
        )
    if active_blockers:
        return f"eta_unavailable_until_primary_blocker_clears:{primary_blocker}"
    return "eta_unavailable"


def build_reboot_sop_progress(
    payload: dict[str, Any],
    active_blockers: list[str],
    readiness_percent: int,
) -> dict[str, Any]:
    """Assemble the fixed reboot-SOP progress fields for the source scorecard.

    Args:
        payload: Scorecard payload. The keys read here are
            ``can_claim_all_services_recovered_within_target``,
            ``sla_recovery_eta``, ``reboot_event_detection`` and
            ``safe_next_step``.
        active_blockers: Blocker identifiers currently active.
        readiness_percent: Readiness value copied into the result unchanged.

    Returns:
        A dict with ``current_phase``, ``eta_or_wait_reason``,
        ``primary_blocker``, ``active_blockers``, ``active_blocker_count``,
        ``readiness_percent``, ``next_safe_action`` and ``fixed_triage_order``.
    """
    can_claim = payload.get("can_claim_all_services_recovered_within_target") is True
    current_phase = reboot_sop_current_phase(active_blockers, can_claim)
    primary_blocker = reboot_sop_primary_blocker(active_blockers)
    eta_or_wait_reason = reboot_sop_eta_or_wait_reason(
        payload,
        active_blockers,
        current_phase,
        primary_blocker,
    )
    sla_eta = payload.get("sla_recovery_eta")
    if not isinstance(sla_eta, dict):
        sla_eta = {}
    return {
        "current_phase": current_phase,
        "eta_or_wait_reason": eta_or_wait_reason,
        "primary_blocker": primary_blocker,
        # Copy so the nested progress dict does not alias the caller's list.
        "active_blockers": list(active_blockers),
        "active_blocker_count": len(active_blockers),
        "readiness_percent": readiness_percent,
        "next_safe_action": str(payload.get("safe_next_step") or ""),
        "fixed_triage_order": strings(sla_eta.get("fixed_triage_order")),
    }
= sum(1 for value in controls.values() if value) source_controls_present = ( bool(controls) and source_control_ready_count == len(controls) @@ -738,8 +864,12 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", "source_id": "reboot_auto_recovery_slo_scorecard", "status": str(payload.get("status") or "unknown"), + "current_phase": sop_progress["current_phase"], + "eta_or_wait_reason": sop_progress["eta_or_wait_reason"], "target_minutes": int_value(payload.get("target_minutes")), "safe_next_step": str(payload.get("safe_next_step") or ""), + "next_safe_action": sop_progress["next_safe_action"], + "primary_blocker": sop_progress["primary_blocker"], "active_blockers": active_blockers, "active_blocker_count": len(active_blockers), "readiness_percent": readiness_percent, @@ -755,6 +885,11 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: summary = { "reboot_auto_recovery_status": str(payload.get("status") or "unknown"), "reboot_auto_recovery_workplan_id": "P0-006", + "reboot_auto_recovery_current_phase": sop_progress["current_phase"], + "reboot_auto_recovery_eta_or_wait_reason": sop_progress[ + "eta_or_wait_reason" + ], + "reboot_auto_recovery_primary_blocker": sop_progress["primary_blocker"], "reboot_auto_recovery_readiness_percent": readiness_percent, "reboot_auto_recovery_active_blocker_count": len(active_blockers), "reboot_auto_recovery_can_claim_slo": ( @@ -775,6 +910,7 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: "stockplatform_ingestion_status" ], "reboot_auto_recovery_safe_next_step": readback["safe_next_step"], + "reboot_auto_recovery_next_safe_action": readback["next_safe_action"], "reboot_auto_recovery_source_controls_present": source_controls_present, "secret_values_collected": False, "github_api_used": False, @@ -783,6 +919,11 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: } payload["required_checks"] = 
required_checks + payload["current_phase"] = sop_progress["current_phase"] + payload["eta_or_wait_reason"] = sop_progress["eta_or_wait_reason"] + payload["primary_blocker"] = sop_progress["primary_blocker"] + payload["next_safe_action"] = sop_progress["next_safe_action"] + payload["reboot_sop_progress"] = sop_progress payload["readback"] = readback payload["rollups"] = rollups payload["summary"] = summary diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index 56677984..31042643 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -169,13 +169,27 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) - assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1" assert payload["status"] == "slo_ready" assert payload["can_claim_all_services_recovered_within_target"] is True + assert payload["current_phase"] == "slo_ready" + assert payload["eta_or_wait_reason"] == "recovered_within_10_minute_slo" + assert payload["primary_blocker"] == "" + assert payload["next_safe_action"] == payload["safe_next_step"] + assert payload["reboot_sop_progress"]["current_phase"] == "slo_ready" + assert payload["reboot_sop_progress"]["active_blocker_count"] == 0 assert payload["readback"]["workplan_id"] == "P0-006" + assert payload["readback"]["current_phase"] == "slo_ready" + assert payload["readback"]["eta_or_wait_reason"] == "recovered_within_10_minute_slo" + assert payload["readback"]["next_safe_action"] == payload["safe_next_step"] assert payload["readback"]["readiness_percent"] == 100 assert payload["readback"]["active_blocker_count"] == 0 assert payload["readback"]["runtime_write_authorized_by_this_scorecard"] is False assert payload["rollups"]["source_controls_present"] is True assert 
payload["rollups"]["readiness_percent"] == 100 assert payload["summary"]["reboot_auto_recovery_workplan_id"] == "P0-006" + assert payload["summary"]["reboot_auto_recovery_current_phase"] == "slo_ready" + assert ( + payload["summary"]["reboot_auto_recovery_eta_or_wait_reason"] + == "recovered_within_10_minute_slo" + ) assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is True assert payload["summary"]["runtime_write_authorized"] is False assert payload["source_controls"][ @@ -221,10 +235,17 @@ def test_services_green_but_old_boot_window_waits_for_reboot_event(tmp_path: Pat assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready" assert payload["active_blockers"] == ["host_boot_observation_older_than_target_window"] + assert payload["current_phase"] == "awaiting_next_reboot_or_approved_drill" + assert ( + payload["eta_or_wait_reason"] + == "target_window_remaining_450s_but_blocked_by_host_boot_observation_older_than_target_window" + ) + assert payload["primary_blocker"] == "host_boot_observation_older_than_target_window" assert payload["safe_next_step"] == ( "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_" "event_or_approved_reboot_drill_to_prove_10_minute_slo" ) + assert payload["next_safe_action"] == payload["safe_next_step"] def test_stockplatform_blocked_before_final_retry_waits_for_readback(tmp_path: Path) -> None: