From 5b5ef7fe2db922d19feb9616d90f3598c4b6d42c Mon Sep 17 00:00:00 2001 From: ogt Date: Thu, 2 Jul 2026 18:32:52 +0800 Subject: [PATCH] fix(reboot): surface windows99 console channel readback --- .../awoooi_priority_work_order_readback.py | 36 ++++++++++ .../reboot_auto_recovery_slo_scorecard.py | 18 +++++ ...awoooi_priority_work_order_readback_api.py | 40 +++++++++++ ..._reboot_auto_recovery_slo_scorecard_api.py | 15 ++++ docs/LOGBOOK.md | 16 +++++ ...-auto-recovery-slo-scorecard.snapshot.json | 40 +++++++++-- docs/runbooks/FULL-STACK-COLD-START-SOP.md | 2 +- ...r-inserted-requirements-priority-ledger.md | 4 +- .../reboot-auto-recovery-slo-scorecard.py | 10 +++ ...test_reboot_auto_recovery_slo_scorecard.py | 14 ++++ ...test_windows99_management_channel_probe.py | 71 +++++++++++++++++++ .../windows99-management-channel-probe.py | 13 +++- 12 files changed, 269 insertions(+), 10 deletions(-) create mode 100644 scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py diff --git a/apps/api/src/services/awoooi_priority_work_order_readback.py b/apps/api/src/services/awoooi_priority_work_order_readback.py index 95fccc80..97089330 100644 --- a/apps/api/src/services/awoooi_priority_work_order_readback.py +++ b/apps/api/src/services/awoooi_priority_work_order_readback.py @@ -2680,6 +2680,15 @@ def _enrich_from_current_readbacks(payload: dict[str, Any]) -> None: state["windows99_rdp_console_reachable"] = bool( windows99_management.get("rdp_console_reachable") is True ) + state["windows99_hyperv_vmconnect_open"] = bool( + windows99_management.get("hyperv_vmconnect_open") is True + ) + state["windows99_local_console_channel_reachable"] = bool( + windows99_management.get("local_console_channel_reachable") is True + ) + state["windows99_console_collection_channels"] = _strings( + windows99_management.get("console_collection_channels") + ) state["windows99_winrm_http_open"] = bool( windows99_management.get("winrm_http_open") is True ) @@ -2873,6 +2882,15 @@ def _enrich_from_current_readbacks(payload: dict[str, Any]) -> None: evidence["windows99_rdp_console_reachable"] = state[ "windows99_rdp_console_reachable" ] + evidence["windows99_hyperv_vmconnect_open"] = state[ + "windows99_hyperv_vmconnect_open" + ] + evidence["windows99_local_console_channel_reachable"] = state[ + "windows99_local_console_channel_reachable" + ] + evidence["windows99_console_collection_channels"] = state[ + "windows99_console_collection_channels" + ] evidence["windows99_winrm_http_open"] = state["windows99_winrm_http_open"] evidence["windows99_winrm_https_open"] = state["windows99_winrm_https_open"] evidence["drill_preflight_status"] = str(reboot_preflight.get("status") or "") @@ -3151,6 +3169,15 @@ def _set_rollups_and_summary( "windows99_rdp_console_reachable": ( state.get("windows99_rdp_console_reachable") is True ), + "windows99_hyperv_vmconnect_open": ( + state.get("windows99_hyperv_vmconnect_open") is True + ), + "windows99_local_console_channel_reachable": ( + state.get("windows99_local_console_channel_reachable") is True + ), + "windows99_console_collection_channels": _strings( + state.get("windows99_console_collection_channels") + ), "p0_004_runtime_readback_ready": p0_004_ready, "reboot_drill_preflight_runtime_readback_ready": ( state.get("reboot_drill_preflight_runtime_readback_state") == "ready" @@ -3222,6 +3249,15 @@ def _set_rollups_and_summary( "windows99_rdp_console_reachable": ( state.get("windows99_rdp_console_reachable") is True ), + "windows99_hyperv_vmconnect_open": ( + state.get("windows99_hyperv_vmconnect_open") is True + ), + "windows99_local_console_channel_reachable": ( + state.get("windows99_local_console_channel_reachable") is True + ), + "windows99_console_collection_channels": _strings( + state.get("windows99_console_collection_channels") + ), "windows99_winrm_http_open": ( state.get("windows99_winrm_http_open") is True ), diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 4bae7248..bc9aec82 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -229,10 +229,19 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_winrm_https_open": ( windows99_management.get("winrm_https_open") is True ), + "windows99_hyperv_vmconnect_open": ( + windows99_management.get("hyperv_vmconnect_open") is True + ), "windows99_rdp_console_reachable": ( windows99_management.get("rdp_console_reachable") is True ), "runtime_readback_generated_at_present": True, + "windows99_local_console_channel_reachable": ( + windows99_management.get("local_console_channel_reachable") is True + ), + "windows99_console_collection_channels": _strings( + windows99_management.get("console_collection_channels") + ), } return { "schema_version": _API_SCHEMA_VERSION, @@ -280,6 +289,9 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_remote_execution_channel_ready" ], "windows99_ssh_batch_status": rollups["windows99_ssh_batch_status"], + "windows99_local_console_channel_reachable": rollups[ + "windows99_local_console_channel_reachable" + ], "readback": { "workplan_id": "P0-006", "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", @@ -322,6 +334,12 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_remote_execution_channel_ready" ], "windows99_ssh_batch_status": rollups["windows99_ssh_batch_status"], + "windows99_local_console_channel_reachable": rollups[ + "windows99_local_console_channel_reachable" + ], + "windows99_console_collection_channels": rollups[ + "windows99_console_collection_channels" + ], }, "reboot_sop_progress": sop_progress, "controlled_service_data_backup_readback": ( diff --git a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py index b5d4163d..50ee7354 100644 --- a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py +++ b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py @@ -146,6 +146,19 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): payload["mainline_execution_state"]["windows99_rdp_console_reachable"] is True ) + assert ( + payload["mainline_execution_state"]["windows99_hyperv_vmconnect_open"] + is True + ) + assert ( + payload["mainline_execution_state"][ + "windows99_local_console_channel_reachable" + ] + is True + ) + assert payload["mainline_execution_state"][ + "windows99_console_collection_channels" + ] == ["rdp_console", "hyperv_vmconnect"] assert payload["next_execution_order"][0].startswith("P0-006:") in_progress = payload["in_progress_or_blocked_in_priority_order"][0] assert in_progress["workplan_id"] == "P0-006" @@ -209,6 +222,15 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): "permission_denied" ) assert in_progress["evidence"]["windows99_rdp_console_reachable"] is True + assert in_progress["evidence"]["windows99_hyperv_vmconnect_open"] is True + assert ( + in_progress["evidence"]["windows99_local_console_channel_reachable"] + is True + ) + assert in_progress["evidence"]["windows99_console_collection_channels"] == [ + "rdp_console", + "hyperv_vmconnect", + ] assert ( in_progress["evidence"][ "controlled_service_data_backup_can_clear_blockers" @@ -285,12 +307,30 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): assert payload["rollups"]["windows99_management_readback_present"] is True assert payload["rollups"]["windows99_remote_execution_channel_ready"] is False assert payload["rollups"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["rollups"]["windows99_hyperv_vmconnect_open"] is True assert payload["rollups"]["windows99_rdp_console_reachable"] is True + assert ( + payload["rollups"]["windows99_local_console_channel_reachable"] + is True + ) + assert payload["rollups"]["windows99_console_collection_channels"] == [ + "rdp_console", + "hyperv_vmconnect", + ] assert payload["summary"]["windows99_management_readback_present"] is True assert payload["summary"]["windows99_management_host_reachable"] is True assert payload["summary"]["windows99_remote_execution_channel_ready"] is False assert payload["summary"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["summary"]["windows99_hyperv_vmconnect_open"] is True assert payload["summary"]["windows99_rdp_console_reachable"] is True + assert ( + payload["summary"]["windows99_local_console_channel_reachable"] + is True + ) + assert payload["summary"]["windows99_console_collection_channels"] == [ + "rdp_console", + "hyperv_vmconnect", + ] assert payload["operation_boundaries"]["github_api_used"] is False assert payload["operation_boundaries"]["github_cli_used"] is False assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 1847a149..5af61a8e 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -200,7 +200,16 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["rollups"]["windows99_remote_execution_channel_ready"] is False assert payload["rollups"]["windows99_ssh_batch_status"] == "permission_denied" assert payload["rollups"]["runtime_readback_generated_at_present"] is True + assert payload["rollups"]["windows99_hyperv_vmconnect_open"] is True assert payload["rollups"]["windows99_rdp_console_reachable"] is True + assert ( + payload["rollups"]["windows99_local_console_channel_reachable"] + is True + ) + assert payload["rollups"]["windows99_console_collection_channels"] == [ + "rdp_console", + "hyperv_vmconnect", + ] assert payload["rollups"]["stockplatform_final_retry_window_passed"] is False assert ( payload["rollups"]["stockplatform_controlled_recovery_gate_required"] @@ -291,7 +300,13 @@ def _assert_reboot_slo_payload(payload: dict): assert windows99_management["host_reachable"] is True assert windows99_management["remote_execution_channel_ready"] is False assert windows99_management["ssh_batch"]["status"] == "permission_denied" + assert windows99_management["hyperv_vmconnect_open"] is True assert windows99_management["rdp_console_reachable"] is True + assert windows99_management["local_console_channel_reachable"] is True + assert windows99_management["console_collection_channels"] == [ + "rdp_console", + "hyperv_vmconnect", + ] stockplatform = payload["stockplatform_data_freshness"] assert stockplatform["freshness_endpoint_readback_present"] is True assert stockplatform["ingestion_endpoint_readback_present"] is True diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e8850cc8..e6392195 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -28,6 +28,22 @@ - 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth。 - 未送 Telegram、未呼叫 Bot API、未寫 Gateway queue、未改 receiver route、未 workflow_dispatch、未寫 production DB、未重啟主機 / VM / Docker / Nginx / K3s / DB / firewall。 +## 2026-07-02 — 18:28 P0-006 Windows99 local console channel readback 產品化 + +**完成內容**: +- Gitea CD `#4406` 已 Success;deploy marker `dc4547b1 chore(cd): deploy add790a [skip ci]`,production readback 對 `add790a47f8cb9d417f91ae3fba19e94153f0400`。 +- `scripts/reboot-recovery/windows99-management-channel-probe.py` 將 TCP `2179` 納入 no-secret management probe,並輸出 `hyperv_vmconnect_open`、`local_console_channel_reachable` 與 `console_collection_channels`,讓 local console Verify lane 不再只停在文字 next step。 +- live no-secret probe 讀回:`22=open`、`135=open`、`445=open`、`2179=open`、`3389=open`、`5985/5986=timeout`、`ssh_batch.status=permission_denied`、`remote_execution_channel_ready=false`、`console_collection_channels=["rdp_console","hyperv_vmconnect"]`。 +- `reboot-auto-recovery-slo-scorecard.py`、`/api/v1/agents/reboot-auto-recovery-slo-scorecard` 與 `awoooi-priority-work-order-readback` 同步上卷 `windows99_local_console_channel_reachable` / `windows99_console_collection_channels`,並把 committed snapshot 更新為 18:28 live readback。 +- 這不代表 99 VMware autostart / Windows Update policy 已驗證;P0-006 仍 fail-closed,因為 live `windows99-vmware-autostart.ps1 -Mode Verify` stdout 尚未收回,remote execution channel 也仍不可用。 + +**驗證**: +- live collector:`WINDOWS99_MAX_AUTH_USERS=5 bash scripts/reboot-recovery/collect-windows99-vmware-verify.sh --check` 回 `ssh_batchmode_auth_ready=0`、`remote_verify_attempted=0`、`verify_collection_status=blocked_ssh_publickey_auth_missing`。 +- live management probe:`python3.11 scripts/reboot-recovery/windows99-management-channel-probe.py --ssh-timeout 3 --tcp-timeout 2 --output /tmp/awoooi-windows99-management-channel-20260702-console.json`。 + +**仍維持**: +- 未讀 Windows 密碼 / secret / token / `.env` / raw sessions / SQLite / auth;未執行 Windows remote command;未重啟主機 / VM / Docker / Nginx / K3s / DB / firewall;未啟動 VM;未使用 GitHub / `gh` / GitHub API。 + ## 2026-07-02 — 15:42 P0-006 Windows99 verify collector 留在 controlled-runtime CD lane **完成內容**: diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index d90e47c1..a83b67b5 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -227,7 +227,12 @@ "windows99_vmware_verify_ready": false, "workflow_trigger_authorized_by_this_scorecard": false, "workplan_id": "P0-006", - "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO" + "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", + "windows99_local_console_channel_reachable": true, + "windows99_console_collection_channels": [ + "rdp_console", + "hyperv_vmconnect" + ] }, "reboot_event_detection": { "all_required_hosts_in_reboot_window": false, @@ -348,7 +353,13 @@ "windows99_vmware_readback_present": false, "windows99_vmware_verify_ready": false, "windows99_winrm_http_open": false, - "windows99_winrm_https_open": false + "windows99_winrm_https_open": false, + "windows99_hyperv_vmconnect_open": true, + "windows99_local_console_channel_reachable": true, + "windows99_console_collection_channels": [ + "rdp_console", + "hyperv_vmconnect" + ] }, "safe_next_step": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1", @@ -542,7 +553,12 @@ "reboot_auto_recovery_workplan_id": "P0-006", "runtime_write_authorized": false, "secret_values_collected": false, - "workflow_trigger_performed": false + "workflow_trigger_performed": false, + "reboot_auto_recovery_windows99_local_console_channel_reachable": true, + "reboot_auto_recovery_windows99_console_collection_channels": [ + "rdp_console", + "hyperv_vmconnect" + ] }, "target_minutes": 10, "target_seconds": 600, @@ -553,6 +569,10 @@ "windows99_ssh_batch_denied" ], "can_collect_vmware_verify_without_secret": false, + "console_collection_channels": [ + "rdp_console", + "hyperv_vmconnect" + ], "forbidden_actions": [ "read_windows_password", "read_secret_value", @@ -561,11 +581,17 @@ "restart_service", "write_windows_policy" ], - "generated_at": "2026-07-02T15:08:44+08:00", + "generated_at": "2026-07-02T18:28:56+08:00", "host": "192.168.0.99", "host_reachable": true, + "hyperv_vmconnect_open": true, + "local_console_channel_reachable": true, + "ping": { + "checked": true, + "ok": true, + "status": "ok" + }, "rdp_console_reachable": true, - "readback_present": true, "remote_execution_channel_ready": false, "schema_version": "windows99_management_channel_readback_v1", "ssh_batch": { @@ -576,6 +602,7 @@ "ssh_user": "administrator", "tcp_ports": { "135": "open", + "2179": "open", "22": "open", "3389": "open", "445": "open", @@ -583,7 +610,8 @@ "5986": "timeout" }, "winrm_http_open": false, - "winrm_https_open": false + "winrm_https_open": false, + "readback_present": true }, "windows99_verify_collection": { "can_collect_no_secret_verify": true, diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 526d7dd9..a3d3780f 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -22,7 +22,7 @@ v1.93 reboot SLA readback fixed fields rule:`scripts/reboot-recovery/reboot-au v1.94 Windows 99 / VMware verifier rule:99 主機與 guest VM 自動啟動不得再只靠「ping 到 99」或「VMX source 存在」宣稱。`scripts/reboot-recovery/windows99-vmware-autostart.ps1 -Mode Verify` 必須輸出 no-secret 固定欄位:`VMRUN_PRESENT`、`VMX alias=... present=...`、`VMWARE_SERVICE ... ok=...`、`VMWARE_AUTOSTART_TASK ... ok=...`、`WINDOWS_UPDATE_POLICY ... ok=...`、`VM_POWER alias=... running=...`、`VMWARE_AUTOSTART_CONFIG_READY`、`VMWARE_AUTOSTART_POWER_READY`、`WINDOWS_UPDATE_NO_AUTO_REBOOT_READY`、`VMWARE_AUTOSTART_VERIFY_READY`。`reboot-auto-recovery-slo-scorecard.py --windows99-vmware-file ` 必須解析這些欄位;缺 readback 時 active blocker 固定為 `windows99_vmware_autostart_readback_missing`,next action 固定為 `collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot`。預設 required guest VM alias 為 `111 / 188 / 120 / 121 / 112`;`110` 不再被 99 VMware autostart 預設清單替代。此 verifier 不讀 Windows 密碼、不讀 secret、不啟動 VM、不重啟 host;`Apply` 仍需獨立 controlled apply 與 post-verifier。 -v1.95 Windows 99 management-channel readback rule:若 99 可 ping / RDP / TCP,但 `windows99-vmware-autostart.ps1 -Mode Verify` 尚未收回,不得只寫成「等 verifier」。必須先跑 `scripts/reboot-recovery/windows99-management-channel-probe.py --output `,並把結果用 `reboot-auto-recovery-slo-scorecard.py --windows99-management-file ` 接入同一份 SLO scorecard。此 probe 只做 no-secret readback:TCP `22 / 135 / 445 / 3389 / 5985 / 5986`、SSH BatchMode publickey、WinRM port、RDP console reachability;不得讀 Windows 密碼、不得啟動 VM、不得重啟、不得改 Windows Update。2026-07-02 15:08 live readback:`host_reachable=true`、`rdp_console_reachable=true`、`ssh_batch.status=permission_denied`、`winrm_http_open=false`、`winrm_https_open=false`、`remote_execution_channel_ready=false`;因此 active blockers 必須包含 `windows99_remote_execution_channel_unavailable` 與 `windows99_vmware_autostart_readback_missing`,next action 固定為 `restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot`。 +v1.95 Windows 99 management-channel readback rule:若 99 可 ping / RDP / TCP,但 `windows99-vmware-autostart.ps1 -Mode Verify` 尚未收回,不得只寫成「等 verifier」。必須先跑 `scripts/reboot-recovery/windows99-management-channel-probe.py --output `,並把結果用 `reboot-auto-recovery-slo-scorecard.py --windows99-management-file ` 接入同一份 SLO scorecard。此 probe 只做 no-secret readback:TCP `22 / 135 / 445 / 2179 / 3389 / 5985 / 5986`、SSH BatchMode publickey、WinRM port、RDP console reachability 與 VMConnect / local console reachability;不得讀 Windows 密碼、不得啟動 VM、不得重啟、不得改 Windows Update。2026-07-02 18:28 live readback:`host_reachable=true`、`rdp_console_reachable=true`、`hyperv_vmconnect_open=true`、`local_console_channel_reachable=true`、`console_collection_channels=["rdp_console","hyperv_vmconnect"]`、`ssh_batch.status=permission_denied`、`winrm_http_open=false`、`winrm_https_open=false`、`remote_execution_channel_ready=false`;因此 active blockers 必須包含 `windows99_remote_execution_channel_unavailable` 與 `windows99_vmware_autostart_readback_missing`,next action 固定為 `restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot`。 2026-07-02 110 control-path / Harbor recovery receipt rule:若 Gitea Harbor repair queue 仍保留 `harbor_110_remote_ssh_publickey_auth_stalled`、remote-control unavailable、jobs stale 或 historical failure,但同一輪本地證據同時證明 `wooo` command path ready、110 local Harbor `/v2/` ready、public/internal registry `/v2/` 回 `401`,則該 Gitea Harbor repair 失敗只能列為 historical queue metadata,不得再當成 current SSH blocker。必須用 `/api/v1/agents/harbor-registry-controlled-recovery-receipt` 或同等 validator 合併 `diagnose-110-ssh-publickey-auth.sh`、`recover-110-control-path-and-harbor-local.sh --check`、public Gitea queue readback 與 registry `/v2/` verifier,並把機器可讀結果寫入 `docs/operations/harbor-110-control-path-recovery-readback-2026-07-02.snapshot.json` 類型的 snapshot。2026-07-02 live receipt 顯示:public/internal registry `/v2/` 均為 `401`、latest visible CD `#4335` 為 `Success`、Gitea Harbor repair failure 已是 `historical_after_latest_cd_success=true`;active blockers 收斂為 110 controlled CD lane config / binary / registration / service guardrail、active action container pressure,以及 Gitea CD jobs head-SHA / stale readback mismatch。若 local-console output 只有 `AWOOOI_110_CONTROLLED_CD_LANE_READY` marker,non110 runner parser 不得從 110 `BLOCKER` 行推導 non110 blocker;non110 只有看到 `AWOOOI_NON110_RUNNER_READY` marker 才能列入 active blocker。 diff --git a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md index 55c6e54a..aab0ec8f 100644 --- a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md +++ b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md @@ -57,9 +57,9 @@ | 順序 | ID | 優先序 | 使用者插入要求 | 正規化工作項 | 目前狀態 | 下一個可驗證動作 | | --- | --- | --- | --- | --- | --- | --- | -| 1 | CIR-P0-RBT-001 | P0 | 「主機重啟後 10 分鐘內全部恢復,且要自動判斷所有主機被重啟」 | 建立 99/110/111/112/120/121/188 reboot event detector + 10 分鐘 SLO scorecard + fixed triage order | 2026-07-02 15:08 live scorecard 已更新:readiness `43%`、active blockers `11`;`windows99_verify_collection` 與 `windows99_management_channel` 已進 API / scorecard;仍缺 fresh all-host 10 分鐘證明,111 不可達,99 uptime / VMware verifier 未閉環 | 優先收斂 99 no-secret management channel / verifier readback 與 111 reachability;不可宣稱 10 分鐘 SLA 已證明 | +| 1 | CIR-P0-RBT-001 | P0 | 「主機重啟後 10 分鐘內全部恢復,且要自動判斷所有主機被重啟」 | 建立 99/110/111/112/120/121/188 reboot event detector + 10 分鐘 SLO scorecard + fixed triage order | 2026-07-02 18:28 live scorecard 已更新:readiness `43%`、active blockers `11`;`windows99_verify_collection`、`windows99_management_channel` 與 `windows99_local_console_channel_reachable` 已進 API / scorecard;仍缺 fresh all-host 10 分鐘證明,111 不可達,99 uptime / VMware verifier 未閉環 | 優先收斂 99 local console Verify output / no-secret management channel 與 111 reachability;不可宣稱 10 分鐘 SLA 已證明 | | 2 | CIR-P0-RBT-002 | P0 | 「沒有偵測到主機重啟」 | 修正 host reboot/shutdown/up detection:boot_id / uptime / node exporter / Windows exporter / VMware VM power state 都要進同一事件 | Scorecard 已接 collection packet + management probe;99 host reachable 但 uptime unknown,111 unreachable,stale hosts 仍存在 | 讓 99 verifier / Windows exporter 或等效 no-secret readback 進入 host boot event,並補 111 reachability 證據 | -| 3 | CIR-P0-RBT-003 | P0 | 「192.168.0.99 VMWare 要自動啟動,裡面 111/188/120/121/112 也自動啟動」 | Windows 99 VMware host autostart + guest VM autostart contract;VM host 111/188/120/121/112 開機順序與 readback | Source verifier / parser / API readback / collection packet 已完成;management probe 讀回 `host_reachable=true`、RDP open、SSH BatchMode `permission_denied`、WinRM timeout;snapshot active blockers=`windows99_remote_execution_channel_unavailable`、`windows99_vmware_autostart_readback_missing` | 恢復 no-secret management channel 或收集 local console Verify output,再確認 `VMRUN_PRESENT`、scheduled task、VMware services、VM power、VMX present 全綠 | +| 3 | CIR-P0-RBT-003 | P0 | 「192.168.0.99 VMWare 要自動啟動,裡面 111/188/120/121/112 也自動啟動」 | Windows 99 VMware host autostart + guest VM autostart contract;VM host 111/188/120/121/112 開機順序與 readback | Source verifier / parser / API readback / collection packet 已完成;management probe 讀回 `host_reachable=true`、RDP open、`2179` VMConnect / console channel open、SSH BatchMode `permission_denied`、WinRM timeout;snapshot active blockers=`windows99_remote_execution_channel_unavailable`、`windows99_vmware_autostart_readback_missing` | 收集 local console Verify output 或恢復 no-secret management channel,再確認 `VMRUN_PRESENT`、scheduled task、VMware services、VM power、VMX present 全綠 | | 4 | CIR-P0-RBT-004 | P0 | 「192.168.0.99 不可因 Windows Update 無預警重開」 | Windows Update reboot policy:active hours / no auto-restart / maintenance window / update notification audit | Source verifier 已補 `WINDOWS_UPDATE_POLICY` 與 `WINDOWS_UPDATE_NO_AUTO_REBOOT_READY`;collection packet 已列 forbidden actions;99 management channel 尚不能收 policy readback | 取得 Verify output;若 policy 不綠,再走 controlled apply,禁止要求或記錄 Windows 密碼 | | 5 | CIR-P0-RBT-005 | P0 | 「網站重啟後 502 嚴重影響體驗,要維護頁,外部雲端或專業做法」 | Public maintenance fallback:Nginx / edge / external static maintenance page / status page / fail-open UX,避免 502 直出 | 尚未完整落地;目前是需求缺口 | 產生 `public_maintenance_fallback` decision record:DNS/edge/外部雲端/本地 Nginx fallback 風險比較,先做不切流量的 check-mode | | 6 | CIR-P0-RBT-006 | P0 | 「所有主機關機立刻 Telegram 告警,重啟後也要告警,其他告警一併完整思考」 | Down / shutdown suspected / reboot detected / reboot recovered / SLO missed / backup failed / freshness stale / CPU pressure / Gitea queue 告警矩陣 | 部分已有 Alertmanager rule 與 Telegram receipt 補強;仍缺完整 shutdown/up E2E receipt | 建立 Telegram alert matrix + receipt verifier,逐項讀回 Alertmanager active/resolved 與 outbound receipt,不送測試 secret | diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index 7b8613bf..bf8ccb25 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -274,7 +274,10 @@ def parse_windows99_management_readback(path: Path | None) -> dict[str, Any]: "ssh_batch": {"checked": False, "ready": False, "status": "missing"}, "winrm_http_open": False, "winrm_https_open": False, + "hyperv_vmconnect_open": False, "rdp_console_reachable": False, + "local_console_channel_reachable": False, + "console_collection_channels": [], "remote_execution_channel_ready": False, "can_collect_vmware_verify_without_secret": False, "blockers": [], @@ -313,7 +316,14 @@ def parse_windows99_management_readback(path: Path | None) -> dict[str, Any]: }, "winrm_http_open": payload.get("winrm_http_open") is True, "winrm_https_open": payload.get("winrm_https_open") is True, + "hyperv_vmconnect_open": payload.get("hyperv_vmconnect_open") is True, "rdp_console_reachable": payload.get("rdp_console_reachable") is True, + "local_console_channel_reachable": ( + payload.get("local_console_channel_reachable") is True + ), + "console_collection_channels": strings( + payload.get("console_collection_channels") + ), "remote_execution_channel_ready": ( payload.get("remote_execution_channel_ready") is True ), diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index 0417925a..8f618ea1 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -100,6 +100,7 @@ WINDOWS99_MANAGEMENT_BLOCKED = { "22": "open", "135": "open", "445": "open", + "2179": "open", "3389": "open", "5985": "timeout", "5986": "timeout", @@ -112,7 +113,10 @@ WINDOWS99_MANAGEMENT_BLOCKED = { }, "winrm_http_open": False, "winrm_https_open": False, + "hyperv_vmconnect_open": True, "rdp_console_reachable": True, + "local_console_channel_reachable": True, + "console_collection_channels": ["rdp_console", "hyperv_vmconnect"], "remote_execution_channel_ready": False, "can_collect_vmware_verify_without_secret": False, "blockers": [ @@ -413,7 +417,17 @@ def test_windows99_management_channel_unavailable_is_visible(tmp_path: Path) -> assert payload["windows99_management_channel"]["ssh_batch"]["status"] == ( "permission_denied" ) + assert payload["windows99_management_channel"]["hyperv_vmconnect_open"] is True assert payload["windows99_management_channel"]["rdp_console_reachable"] is True + assert ( + payload["windows99_management_channel"][ + "local_console_channel_reachable" + ] + is True + ) + assert payload["windows99_management_channel"][ + "console_collection_channels" + ] == ["rdp_console", "hyperv_vmconnect"] assert payload["rollups"]["windows99_management_readback_present"] is True assert payload["rollups"]["windows99_host_reachable"] is True assert payload["rollups"]["windows99_remote_execution_channel_ready"] is False diff --git a/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py b/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py new file mode 100644 index 00000000..41d8eaa3 --- /dev/null +++ b/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +import argparse +import importlib.util +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +SCRIPT = ROOT / "scripts" / "reboot-recovery" / "windows99-management-channel-probe.py" + + +def _load_module(): + spec = importlib.util.spec_from_file_location( + "windows99_management_channel_probe", + SCRIPT, + ) + assert spec is not None + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +def test_management_probe_surfaces_no_secret_console_channels(monkeypatch): + module = _load_module() + + def fake_tcp_status(_host: str, port: int, _timeout: float) -> str: + return "open" if port in {22, 2179, 3389} else "timeout" + + monkeypatch.setattr(module, "tcp_status", fake_tcp_status) + monkeypatch.setattr( + module, + "ping_status", + lambda _host: {"checked": True, "ok": False, "status": "failed"}, + ) + monkeypatch.setattr( + module, + "ssh_batch_status", + lambda *_args, **_kwargs: { + "checked": True, + "ready": False, + "status": "permission_denied", + }, + ) + + payload = module.build_payload( + argparse.Namespace( + host="192.168.0.99", + ssh_user="administrator", + tcp_timeout=0.01, + ssh_timeout=1, + ports=None, + skip_ssh=False, + generated_at="2026-07-02T16:00:00+08:00", + output=None, + ) + ) + + assert "2179" in payload["tcp_ports"] + assert payload["readback_present"] is True + assert payload["hyperv_vmconnect_open"] is True + assert payload["rdp_console_reachable"] is True + assert payload["local_console_channel_reachable"] is True + assert payload["console_collection_channels"] == [ + "rdp_console", + "hyperv_vmconnect", + ] + assert payload["remote_execution_channel_ready"] is False + assert payload["can_collect_vmware_verify_without_secret"] is False + assert "windows99_remote_execution_channel_unavailable" in payload["blockers"] + assert "read_windows_password" in payload["forbidden_actions"] diff --git a/scripts/reboot-recovery/windows99-management-channel-probe.py b/scripts/reboot-recovery/windows99-management-channel-probe.py index 0c81bff6..3550faa6 100644 --- a/scripts/reboot-recovery/windows99-management-channel-probe.py +++ b/scripts/reboot-recovery/windows99-management-channel-probe.py @@ -20,7 +20,7 @@ from typing import Any SCHEMA_VERSION = "windows99_management_channel_readback_v1" -DEFAULT_PORTS = (22, 135, 445, 3389, 5985, 5986) +DEFAULT_PORTS = (22, 135, 445, 2179, 3389, 5985, 5986) def parse_args() -> argparse.Namespace: @@ -157,7 +157,14 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: host_reachable = ping["ok"] or any(status == "open" for status in tcp_ports.values()) winrm_http_open = tcp_ports.get("5985") == "open" winrm_https_open = tcp_ports.get("5986") == "open" + hyperv_vmconnect_open = tcp_ports.get("2179") == "open" rdp_console_reachable = tcp_ports.get("3389") == "open" + console_collection_channels = [] + if rdp_console_reachable: + console_collection_channels.append("rdp_console") + if hyperv_vmconnect_open: + console_collection_channels.append("hyperv_vmconnect") + local_console_channel_reachable = bool(console_collection_channels) ssh_probe = ( {"checked": False, "ready": False, "status": "skipped"} if args.skip_ssh @@ -181,6 +188,7 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: return { "schema_version": SCHEMA_VERSION, + "readback_present": True, "generated_at": generated_at, "host": args.host, "ping": ping, @@ -190,7 +198,10 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: "ssh_batch": ssh_probe, "winrm_http_open": winrm_http_open, "winrm_https_open": winrm_https_open, + "hyperv_vmconnect_open": hyperv_vmconnect_open, "rdp_console_reachable": rdp_console_reachable, + "local_console_channel_reachable": local_console_channel_reachable, + "console_collection_channels": console_collection_channels, "remote_execution_channel_ready": remote_execution_ready, "can_collect_vmware_verify_without_secret": remote_execution_ready, "blockers": blockers,