From e28ebd5b3ead0806d8560822c8a025a45c720ffd Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 3 Jul 2026 09:25:20 +0800 Subject: [PATCH] fix(reboot): surface windows99 console artifact blockers --- .../reboot_auto_recovery_slo_scorecard.py | 58 +++++++++++++++ ..._reboot_auto_recovery_slo_scorecard_api.py | 59 +++++++++++++++ docs/LOGBOOK.md | 14 ++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 4 +- ...oot-cold-start-backup-recovery-workplan.md | 8 +- ...r-inserted-requirements-priority-ledger.md | 8 +- .../reboot-auto-recovery-slo-scorecard.py | 74 +++++++++++++++++++ ...test_reboot_auto_recovery_slo_scorecard.py | 50 +++++++++++++ ...test_windows99_management_channel_probe.py | 55 ++++++++++++++ .../windows99-management-channel-probe.py | 53 +++++++++++++ 10 files changed, 374 insertions(+), 9 deletions(-) diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 0a663cf3a..5a2b16e7e 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -1471,6 +1471,7 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: active_blockers = _unique_strings( _strings(scorecard.get("active_blockers")) + _strings(public_maintenance.get("blockers")) + + _strings(windows99_management.get("console_artifact_blockers")) ) required_checks = { "source_controls_present": all(source_controls.values()), @@ -1701,6 +1702,18 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_available_collection_channels": _strings( windows99_verify_collection.get("available_collection_channels") ), + "windows99_console_artifact_status": str( + windows99_verify_collection.get("console_artifact_status") or "unknown" + ), + "windows99_console_artifact_reliable": ( + windows99_verify_collection.get("console_artifact_reliable") is True + ), + "windows99_console_artifact_blockers": _strings( + windows99_verify_collection.get("console_artifact_blockers") + ), + "windows99_console_artifact_safe_next_step": str( + windows99_verify_collection.get("console_artifact_safe_next_step") or "" + ), "windows99_host99_reachable": ( windows99_verify_collection["host99_reachable"] is True ), @@ -1864,6 +1877,18 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_available_collection_channels": rollups[ "windows99_available_collection_channels" ], + "windows99_console_artifact_status": rollups[ + "windows99_console_artifact_status" + ], + "windows99_console_artifact_reliable": rollups[ + "windows99_console_artifact_reliable" + ], + "windows99_console_artifact_blockers": rollups[ + "windows99_console_artifact_blockers" + ], + "windows99_console_artifact_safe_next_step": rollups[ + "windows99_console_artifact_safe_next_step" + ], "windows99_remote_execution_channel_ready": rollups[ "windows99_remote_execution_channel_ready" ], @@ -2089,6 +2114,15 @@ def _build_windows99_verify_collection_packet( windows99_management.get("local_console_channel_reachable") is True ) console_channels = _strings(windows99_management.get("console_collection_channels")) + console_artifact_status = str( + windows99_management.get("console_artifact_status") or "unknown" + ) + console_artifact_reliable = ( + windows99_management.get("console_artifact_reliable") is True + ) + console_artifact_blockers = _strings( + windows99_management.get("console_artifact_blockers") + ) collector_present = windows99_collector.get("readback_present") is True collector_status = str(windows99_collector.get("status") or "unknown") collector_ssh_ready = ( @@ -2106,6 +2140,9 @@ def _build_windows99_verify_collection_packet( for blocker in _strings(windows99_collector.get("blockers")): if blocker not in collection_blockers: collection_blockers.append(blocker) + for blocker in console_artifact_blockers: + if blocker not in collection_blockers: + collection_blockers.append(blocker) if not host99_uptime_known: collection_blockers.append("windows99_uptime_unknown") available_channels: list[str] = [] @@ -2144,6 +2181,12 @@ def _build_windows99_verify_collection_packet( ) ), "available_collection_channels": available_channels, + "console_artifact_status": console_artifact_status, + "console_artifact_reliable": console_artifact_reliable, + "console_artifact_blockers": console_artifact_blockers, + "console_artifact_safe_next_step": str( + windows99_management.get("console_artifact_safe_next_step") or "" + ), "no_secret_collector_readback_present": collector_present, "no_secret_collector_status": collector_status, "no_secret_collector_safe_next_step": str( @@ -2278,6 +2321,9 @@ def _reboot_sop_current_phase(active_blockers: list[str], can_claim_slo: bool) - "windows99_vmware_autostart_config_not_ready", "windows99_vmware_guest_power_not_ready", "windows99_update_no_auto_reboot_policy_not_ready", + "windows99_console_clipboard_unreliable", + "windows99_console_focus_unreliable", + "windows99_console_verify_output_truncated", } if any(blocker in host_boot_blockers for blocker in active_blockers): return "host_boot_detection_blocked" @@ -2329,6 +2375,9 @@ def _reboot_sop_primary_blocker(active_blockers: list[str]) -> str: "windows99_vmware_autostart_config_not_ready", "windows99_vmware_guest_power_not_ready", "windows99_update_no_auto_reboot_policy_not_ready", + "windows99_console_clipboard_unreliable", + "windows99_console_focus_unreliable", + "windows99_console_verify_output_truncated", "public_route_raw_5xx_without_maintenance_fallback", "public_route_unreachable_without_external_l1_fallback", "public_maintenance_fallback_runtime_readback_missing", @@ -2450,6 +2499,15 @@ def _active_blocker_action_row( "restore_windows99_no_secret_management_channel_or_collect_local_" "console_verify_readback_then_rerun_reboot_scorecard_no_reboot" ) + elif blocker in { + "windows99_console_clipboard_unreliable", + "windows99_console_focus_unreliable", + "windows99_console_verify_output_truncated", + }: + next_safe_action = ( + "stop_unreliable_rdp_clipboard_path_and_use_authorized_no_secret_" + "management_channel_or_validated_console_stdout_artifact" + ) post_verifier = ( "bash scripts/reboot-recovery/collect-windows99-vmware-verify.sh " "--check && rerun_reboot_auto_recovery_slo_scorecard" diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index fe01cb291..080b1f9b2 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -544,6 +544,65 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics( ) +def test_reboot_auto_recovery_slo_scorecard_surfaces_windows99_console_artifact_blocker( + tmp_path, +): + runtime_scorecard = json.loads(_SOURCE_SCORECARD.read_text(encoding="utf-8")) + runtime_scorecard["windows99_management_channel"] = { + **runtime_scorecard.get("windows99_management_channel", {}), + "readback_present": True, + "host_reachable": True, + "remote_execution_channel_ready": False, + "console_artifact_status": "blocked_clipboard_unreliable", + "console_artifact_reliable": False, + "console_artifact_blockers": [ + "windows99_console_clipboard_unreliable", + ], + "console_artifact_safe_next_step": ( + "use_authorized_no_secret_management_channel_or_manual_console_" + "stdout_capture" + ), + } + runtime_path = tmp_path / "scorecard.json" + runtime_path.write_text(json.dumps(runtime_scorecard), encoding="utf-8") + + payload = load_latest_reboot_auto_recovery_slo_scorecard( + runtime_scorecard_path=runtime_path + ) + + assert "windows99_console_clipboard_unreliable" in payload["active_blockers"] + collection = payload["windows99_verify_collection"] + assert collection["console_artifact_status"] == "blocked_clipboard_unreliable" + assert collection["console_artifact_reliable"] is False + assert collection["console_artifact_blockers"] == [ + "windows99_console_clipboard_unreliable" + ] + assert "windows99_console_clipboard_unreliable" in collection[ + "collection_blockers" + ] + assert payload["rollups"]["windows99_console_artifact_status"] == ( + "blocked_clipboard_unreliable" + ) + assert payload["rollups"]["windows99_console_artifact_reliable"] is False + assert payload["readback"]["windows99_console_artifact_blockers"] == [ + "windows99_console_clipboard_unreliable" + ] + action_by_blocker = { + item["blocker"]: item + for item in payload["active_blocker_action_matrix"]["items"] + } + console_action = action_by_blocker["windows99_console_clipboard_unreliable"] + assert console_action["category"] == "windows99_vmware_autostart" + assert ( + console_action["owner_lane"] + == "windows99_console_or_no_secret_management_channel" + ) + assert console_action["next_safe_action"] == ( + "stop_unreliable_rdp_clipboard_path_and_use_authorized_no_secret_" + "management_channel_or_validated_console_stdout_artifact" + ) + + def test_reboot_auto_recovery_slo_scorecard_keeps_service_backup_ready_after_live_stockplatform_ok(): payload = load_latest_reboot_auto_recovery_slo_scorecard( prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index dda1805e8..93ae2f2a4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,17 @@ +## 2026-07-03 — 09:22 Windows99 console artifact blocker 機器可讀化 + +**完成內容**: +- `windows99-management-channel-probe.py` 新增 `console_artifact_status`、`console_artifact_reliable`、`console_artifact_blockers` 與 `console_artifact_safe_next_step`,把 RDP / VMConnect 可見但 clipboard / focus / stdout 不可信的狀態轉成明確 blocker。 +- `reboot-auto-recovery-slo-scorecard.py` 與 production API loader 同步上卷 `windows99_console_clipboard_unreliable` / `windows99_console_focus_unreliable` / `windows99_console_verify_output_truncated` 到 `active_blockers`、`windows99_verify_collection.collection_blockers`、readback、rollups 與 `active_blocker_action_matrix`。 +- SOP 升到 v1.108,並同步 P0 workplan / 統帥插入需求台帳:RDP 畫面片段、截斷 stdout、Windows 端舊剪貼簿內容都不得當作 `windows99-vmware-autostart.ps1 -Mode Verify` 完成證據。 + +**已跑驗證**: +- `python3.11 -m py_compile scripts/reboot-recovery/windows99-management-channel-probe.py scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py apps/api/src/services/reboot_auto_recovery_slo_scorecard.py`:通過。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost/test PYTHONPATH=apps/api python3.11 -m pytest scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py -q -p no:cacheprovider`:`35 passed`。 + +**仍維持**: +- 本輪只做 source/API contract、測試與 SOP 沉澱;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / gh;未 workflow_dispatch;未啟動或關閉 VM;未重啟 host / service;未 Docker / Nginx / K3s / DB / firewall restart;未 DROP / TRUNCATE / restore / prune / delete / force push。 + ## 2026-07-03 — 09:05 Gitea bundle backup / dev-prod repo truth 交叉驗證 **完成內容**: diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 066c171fd..85e82ac12 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.107 +> Version: v1.108 > Last updated: 2026-07-03 Asia/Taipei > Scope: 99 / 110 / 111 / 112 / 120 / 121 / 188 全棧重啟恢復。112 仍是 Kali / VM guest 訊號,但 2026-06-30 全主機重啟後已納入 10 分鐘 SLO 的必要 boot / power signal;此納入不代表授權任何破壞性 runtime apply。 @@ -46,6 +46,8 @@ v1.105 public maintenance edge source-to-live drift rule:Gitea CD `#4519` 已 v1.107 2026-07-03 08:23 reboot verify-only readback rule:production `/api/v1/agents/reboot-auto-recovery-slo-scorecard` 仍回 `status=blocked_reboot_auto_recovery_slo_not_ready`、`active_blocker_count=8`、`readiness_percent=67`、`primary_blocker=reboot_event_required_host_unreachable`,active blockers 固定為 `all_required_hosts_not_in_10_minute_reboot_window`、`fresh_all_host_reboot_event_missing`、`host_boot_observation_older_than_target_window`、`host_unreachable_after_reboot`、`host_uptime_unknown`、`reboot_event_required_host_unreachable`、`windows99_vmware_guest_power_not_ready`、`windows99_vmware_vmx_missing`。同輪 no-write artifact `/tmp/awoooi-reboot-verify-only-20260703-082310` 證明:99 ping/RDP/Hyper-V VMConnect 可達但 uptime unknown,SSH BatchMode / WinRM 不可用;111 仍 unreachable;112/120/121 可達但 uptime 已超過 10 分鐘目標窗口;188 可達但 `systemd_state=degraded`、`awoooi-startup.service failed`。因此不得宣稱 10 分鐘 SLO 已完成,也不得把 RDP 可見或 source verifier 存在當作 VMware autostart ready。下一步固定為 `restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot`,並同時保持 `rerun_reboot_event_detector_and_host_probe_verify_only_no_reboot`;禁止讀 Windows 密碼、啟動 / 關閉 VM、host reboot、service restart、Docker / Nginx / K3s / DB / firewall restart、restore、prune、delete。 +v1.108 Windows99 console artifact reliability blocker rule:若 RDP / Hyper-V VMConnect 畫面可見,但 clipboard paste、視窗 focus 或 stdout 擷取不可靠,不能把畫面片段、截斷 output 或 Windows 端舊剪貼簿內容當成 `windows99-vmware-autostart.ps1 -Mode Verify` artifact。`windows99-management-channel-probe.py` 必須輸出 `console_artifact_status`、`console_artifact_reliable`、`console_artifact_blockers` 與 `console_artifact_safe_next_step`;`blocked_clipboard_unreliable`、`blocked_focus_unreliable`、`blocked_truncated_output` 分別轉成 `windows99_console_clipboard_unreliable`、`windows99_console_focus_unreliable`、`windows99_console_verify_output_truncated`。這些 blocker 必須進 `active_blockers`、`windows99_verify_collection.collection_blockers`、readback、rollups 與 `active_blocker_action_matrix`,owner lane 固定為 `windows99_console_or_no_secret_management_channel`,next safe action 固定為 `stop_unreliable_rdp_clipboard_path_and_use_authorized_no_secret_management_channel_or_validated_console_stdout_artifact`。完成條件仍是取得 validator 可接受的完整 no-secret Verify stdout 或恢復授權 no-secret management channel;不得讀 Windows 密碼、不得啟動 / 關閉 VM、不得重啟 host / service。 + 2026-07-02 110 control-path / Harbor recovery receipt rule:若 Gitea Harbor repair queue 仍保留 `harbor_110_remote_ssh_publickey_auth_stalled`、remote-control unavailable、jobs stale 或 historical failure,但同一輪本地證據同時證明 `wooo` command path ready、110 local Harbor `/v2/` ready、public/internal registry `/v2/` 回 `401`,則該 Gitea Harbor repair 失敗只能列為 historical queue metadata,不得再當成 current SSH blocker。必須用 `/api/v1/agents/harbor-registry-controlled-recovery-receipt` 或同等 validator 合併 `diagnose-110-ssh-publickey-auth.sh`、`recover-110-control-path-and-harbor-local.sh --check`、public Gitea queue readback 與 registry `/v2/` verifier,並把機器可讀結果寫入 `docs/operations/harbor-110-control-path-recovery-readback-2026-07-02.snapshot.json` 類型的 snapshot。2026-07-02 live receipt 顯示:public/internal registry `/v2/` 均為 `401`、latest visible CD `#4335` 為 `Success`、Gitea Harbor repair failure 已是 `historical_after_latest_cd_success=true`;active blockers 收斂為 110 controlled CD lane config / binary / registration / service guardrail、active action container pressure,以及 Gitea CD jobs head-SHA / stale readback mismatch。若 local-console output 只有 `AWOOOI_110_CONTROLLED_CD_LANE_READY` marker,non110 runner parser 不得從 110 `BLOCKER` 行推導 non110 blocker;non110 只有看到 `AWOOOI_NON110_RUNNER_READY` marker 才能列入 active blocker。 2026-07-02 110 controlled CD lane fail-closed enforcer staging rule:110 runner 壓力事故後,legacy / generic runner 仍必須 fail-closed;但 `awoooi-cd-lane-drain.service` 的非 secret staging artifact 不得再被 enforcer 無差別封回 stub。`scripts/reboot-recovery/enforce-110-runner-failclosed.sh` 只有在 `config.yaml` 符合 `capacity <= 1`、只含 `awoooi-host:host` 與 `awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04`、binary 是 executable ELF、systemd unit 具備 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`、`CPUAccounting` / `MemoryAccounting` / `TasksAccounting` / `NoNewPrivileges` 等 guardrail,且 service `inactive`、`MainPID=0`、未 enabled / 未 masked 時,才可保留 drain config / binary / unit,並輸出 `CONTROLLED_DRAIN_STAGING_ALLOWED=1` 與 textfile metric。此 staging 規則不得讀 token、不得讀 `.runner` 內容、不得註冊 runner、不得啟動 service;若 registration 缺失,readiness verifier 仍必須只留下 `controlled_cd_lane_registration_missing` / `controlled_cd_lane_service_not_active` 類 blocker。若 `CONTROLLED_DRAIN_STAGING_ALLOWED=0` 且 config / binary 又被搬走,優先修 source enforcer / unit guardrail,不要手工反覆補同一組 artifact。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index d51f2e269..f7bb9b139 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -13,19 +13,19 @@ 本段覆蓋舊的「單次重啟後人工排查」做法。所有後續狀態回報必須依此順序推進;噪音若會遮蔽 P0,就掛回同一列,不另開支線。 -### 2026-07-03 08:59 最新 P0 覆蓋排序 +### 2026-07-03 09:22 最新 P0 覆蓋排序 下表覆蓋 2026-06-30 初始事故列;舊表保留為歷史追蹤。所有新插入需求必須掛在本表,不得再分散成臨時支線。 | 優先 | 狀態 | 工作項 | 最新證據 | 下一步 / 完成條件 | |------|------|--------|----------|-------------------| -| P0-1 | BLOCKED_HOST_WINDOWS | 全主機 reboot auto-detection / auto-trigger / 10 分鐘恢復 SLO | 2026-07-03 08:59 production scorecard:`status=blocked_reboot_auto_recovery_slo_not_ready`、`active_blocker_count=8`、`readiness_percent=67`、`primary_blocker=reboot_event_required_host_unreachable`、`can_claim_all_services_recovered_within_target=false`。no-write artifact `/tmp/awoooi-reboot-continue-20260703-085205`:111 socket probe 仍 timeout / unreachable;99 RDP / VMConnect reachable 但 uptime unknown;188 socket probe reachable,前次 host probe 仍為 `systemd_state=degraded` / `startup_active=failed`。Production readback 同步顯示 `windows99_vmware_readback_present=true`、`windows99_vmware_config_ready=false`、`windows99_vmware_power_ready=false`、`windows99_missing_vmx_aliases=["111"]`、`windows99_powered_off_aliases=["111","112","120","121","188"]`。 | 先收斂 99 / Windows99 / VMware 與 111:恢復 no-secret management channel 或取得可靠 console verify stdout,讀回 VMX / VM power / host uptime,再 rerun host probe + reboot-event detector;不得 reboot、不得 VM power change、不得讀 Windows 密碼。 | +| P0-1 | BLOCKED_HOST_WINDOWS | 全主機 reboot auto-detection / auto-trigger / 10 分鐘恢復 SLO | 2026-07-03 08:59 production scorecard:`status=blocked_reboot_auto_recovery_slo_not_ready`、`active_blocker_count=8`、`readiness_percent=67`、`primary_blocker=reboot_event_required_host_unreachable`、`can_claim_all_services_recovered_within_target=false`。no-write artifact `/tmp/awoooi-reboot-continue-20260703-085205`:111 socket probe 仍 timeout / unreachable;99 RDP / VMConnect reachable 但 uptime unknown;188 socket probe reachable,前次 host probe 仍為 `systemd_state=degraded` / `startup_active=failed`。Production readback 同步顯示 `windows99_vmware_readback_present=true`、`windows99_vmware_config_ready=false`、`windows99_vmware_power_ready=false`、`windows99_missing_vmx_aliases=["111"]`、`windows99_powered_off_aliases=["111","112","120","121","188"]`。09:22 source/API contract 已把 unreliable console artifact 額外投影成 `windows99_console_clipboard_unreliable` 類 active blocker。 | 先收斂 99 / Windows99 / VMware 與 111:恢復 no-secret management channel 或取得 validator 可接受的完整 console Verify stdout,讀回 VMX / VM power / host uptime,再 rerun host probe + reboot-event detector;不得 reboot、不得 VM power change、不得讀 Windows 密碼,不得用 RDP clipboard 片段當完成證據。 | | P0-2 | BLOCKED_EDGE_PRIVILEGED_APPLY | Deploy / reboot 期間 public 502 維護頁與外部 fallback | Gitea CD `#4519` 已推 deploy marker `3aca484 -> a94ddd5`,但 marker 後 public probe 仍讀到 `https://awoooi.wooo.work/api/v1/health` raw `502`、fallback header/body 空;live 188 `/etc/nginx/sites-enabled/awoooi.wooo.work.conf` 缺 maintenance fallback,`/var/www/maintenance/maintenance.html` 缺失,`ollama@188` 無 passwordless sudo。 | 先跑 `scripts/reboot-recovery/public-maintenance-edge-fallback-apply.sh --check` 留 drift receipt;具備 privileged channel 後執行 `--apply`,要求 backup、`nginx -t`、reload、public route probe 全綠。不得讀密碼、不得用 app restart 掩蓋 edge fallback drift。 | | P0-3 | PARTIAL_GREEN_SOURCE_RUNTIME | 所有產品 / 網站版本與資料最新性 | Gitea `main=89d4d6112`;Production SLO readback 對齊 deploy marker `89d4d6112 chore(cd): deploy 17ba08c [skip ci]`;Stock freshness `status=ok`、`latest_trading_date=2026-07-02`、blockers `[]`。AWOOOI health HTTP 200 但整體 `degraded`,SignOz / local Ollama 仍需列為 runtime degraded evidence。 | 將 source SHA / deploy marker / runtime endpoint / public route watch 固定進 scorecard;完成條件是每個 public product 都有 source、deploy、runtime、freshness 四層 readback,且 degraded components 有 owner lane。 | -| P0-4 | BLOCKED_WINDOWS99_AUTOSTART | 192.168.0.99 VMware 自動啟動與 VM guest 111 / 188 / 120 / 121 / 112 | 08:51 no-secret management probe:99 reachable、TCP `22 / 135 / 2179 / 3389 / 445=open`、RDP console / Hyper-V VMConnect reachable,WinRM `5985 / 5986=timeout`,`administrator` SSH BatchMode `permission_denied`,`can_collect_vmware_verify_without_secret=false`。collector check:`ssh_batchmode_auth_ready=0`、`verify_collection_status=blocked_ssh_publickey_auth_missing`、`remote_verify_attempted=0`。08:59 production scorecard 已有 Windows99 runtime readback,但回 `missing_vmx_aliases=["111"]`、guest power not ready;RDP console UI/clipboard 測試顯示 clipboard 會貼入 Windows 端舊內容,不能當可靠自動收集通道。 | 只用可靠 no-secret console / management collector 取得 `windows99-vmware-autostart.ps1 -Mode Verify` stdout,先以 `validate-windows99-console-verify-artifact.py` 轉成 normalized artifact,再重跑 SLO scorecard;完成條件是 VMX config ready、guest power ready、99 uptime known、all required host reachable。 | +| P0-4 | BLOCKED_WINDOWS99_AUTOSTART | 192.168.0.99 VMware 自動啟動與 VM guest 111 / 188 / 120 / 121 / 112 | 08:51 no-secret management probe:99 reachable、TCP `22 / 135 / 2179 / 3389 / 445=open`、RDP console / Hyper-V VMConnect reachable,WinRM `5985 / 5986=timeout`,`administrator` SSH BatchMode `permission_denied`,`can_collect_vmware_verify_without_secret=false`。collector check:`ssh_batchmode_auth_ready=0`、`verify_collection_status=blocked_ssh_publickey_auth_missing`、`remote_verify_attempted=0`。08:59 production scorecard 已有 Windows99 runtime readback,但回 `missing_vmx_aliases=["111"]`、guest power not ready;RDP console UI/clipboard 測試顯示 clipboard 會貼入 Windows 端舊內容,不能當可靠自動收集通道。09:22 已補 `console_artifact_status/reliable/blockers/safe_next_step` 與 API regression,避免下次再把可見畫面誤判成可用 artifact。 | 只用可靠 no-secret console / management collector 取得 `windows99-vmware-autostart.ps1 -Mode Verify` stdout,先以 `validate-windows99-console-verify-artifact.py` 轉成 normalized artifact,再重跑 SLO scorecard;完成條件是 VMX config ready、guest power ready、99 uptime known、all required host reachable。 | | P0-5 | RUNTIME_READY_BACKUP_RECEIPT_GAP | Gitea / 主機 / DB / 網站 / 服務 / 套件 / 工具 / log 備份監控告警 | Gitea repo bundle readback ready:expected `12`、rows `12`、missing `0`、failed `0`、sample restore dry-run ok;backup core green。2026-07-03 source / runtime 已部署 `awoooi_backup_alert_receipt_*` 指標與 Prometheus rules;110 exporter 讀回 88 個 stage requirement、188 讀回 12 個 stage requirement,`BackupAlertReceiptMetricMissing*` inactive,`BackupAlertReceiptStageMissing` 已修成每 `host / receipt_channel` 聚合 pending:110 一條、188 一條。 | 補 `/backup/alert-receipts/*.last_success` 脫敏 marker;下一層仍要補 Gitea full dump、DB/settings/issues/packages/LFS、所有工具與 log 全量備份監控。 | | P0-6 | RUNTIME_READY_ALERT_RECEIPT_GAP | 主機關機 / 重啟 / SLO miss / backup failure Telegram 告警 | Reboot per-blocker alert 與 backup receipt alert rules 已 deploy/readback;backup receipt 缺段不再產生 100 條 stage 噪音,現在聚合成 110 / 188 兩條 host-level pending。scorecard 仍有 8 個 reboot active blockers,尚未完成 shutdown / reboot / backup alert 的 production 脫敏 delivery receipt 全矩陣。 | 補 alert receipt readback:host down、host up、SLO miss、Windows99 blocker、backup stale/failed、deploy 502、freshness stale;完成條件是每類告警都有 sent / received / dedup / escalation evidence。 | -| P0-7 | SOURCE_READY_SLA_AUTOMATION | 固定排查順序、ETA / wait reason、自動化判斷與修復 | Scorecard 已固定 `current_phase=host_boot_detection_blocked`、`eta_or_wait_reason=reboot_event_readback_missing_eta_unavailable`、`primary_blocker=reboot_event_required_host_unreachable`、fixed triage order 與 next safe action;08:23 artifact 固定下一步為 no-secret Windows99 verify / host probe rerun。10 分鐘內自動恢復仍未達標。 | 把每個 blocker 的 next_safe_action、post_verifier、forbidden_actions 接到自動 work item / Telegram / scorecard;完成條件是重啟後自動判斷、主動告警、主動 rerun verifier,不再人工臨場猜流程。 | +| P0-7 | SOURCE_READY_SLA_AUTOMATION | 固定排查順序、ETA / wait reason、自動化判斷與修復 | Scorecard 已固定 `current_phase=host_boot_detection_blocked`、`eta_or_wait_reason=reboot_event_readback_missing_eta_unavailable`、`primary_blocker=reboot_event_required_host_unreachable`、fixed triage order 與 next safe action;08:23 artifact 固定下一步為 no-secret Windows99 verify / host probe rerun。09:22 `active_blocker_action_matrix` 已能把 unreliable console artifact 指向 `windows99_console_or_no_secret_management_channel` 與固定 next safe action。10 分鐘內自動恢復仍未達標。 | 把每個 blocker 的 next_safe_action、post_verifier、forbidden_actions 接到自動 work item / Telegram / scorecard;完成條件是重啟後自動判斷、主動告警、主動 rerun verifier,不再人工臨場猜流程。 | | P0-8 | PARTIAL_READY_POLICY | Windows99 禁止 Windows Update 無預警重啟 | Scorecard:`windows99_update_no_auto_reboot_ready=true`;但 Windows99 no-secret remote execution channel blocked,policy 證據仍需納入持續監控。 | 保留 readback,補週期性 verifier 與 Telegram drift alert;完成條件是 Windows Update policy drift 會自動告警且不需讀 secret。 | | 優先 | 狀態 | 工作項 | 2026-06-30 證據 | 下一步 / 完成條件 | diff --git a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md index e0ebfc0c6..cdf5f368d 100644 --- a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md +++ b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md @@ -57,21 +57,21 @@ | 順序 | ID | 優先序 | 使用者插入要求 | 正規化工作項 | 目前狀態 | 下一個可驗證動作 | | --- | --- | --- | --- | --- | --- | --- | -| 1 | CIR-P0-RBT-001 | P0 | 「主機重啟後 10 分鐘內全部恢復,且要自動判斷所有主機被重啟」 | 建立 99/110/111/112/120/121/188 reboot event detector + 10 分鐘 SLO scorecard + fixed triage order | 2026-07-03 08:23 production scorecard:readiness `67%`、active blockers `8`、`can_claim_all_services_recovered_within_target=false`;artifact `/tmp/awoooi-reboot-verify-only-20260703-082310`:`reboot_detected=false`、`fresh_boot_hosts=[]`、111 不可達、99 uptime unknown、188 startup failed/degraded | 優先收斂 99 local console Verify output / no-secret management channel 與 111 reachability;不可宣稱 10 分鐘 SLA 已證明 | +| 1 | CIR-P0-RBT-001 | P0 | 「主機重啟後 10 分鐘內全部恢復,且要自動判斷所有主機被重啟」 | 建立 99/110/111/112/120/121/188 reboot event detector + 10 分鐘 SLO scorecard + fixed triage order | 2026-07-03 08:59 production scorecard:readiness `67%`、active blockers `8`、`can_claim_all_services_recovered_within_target=false`;artifact `/tmp/awoooi-reboot-continue-20260703-085205`:111 不可達、99 uptime unknown、188 startup failed/degraded;09:22 已把 unreliable console artifact 投影成 machine-readable blocker | 優先收斂 99 local console Verify output / no-secret management channel 與 111 reachability;不可宣稱 10 分鐘 SLA 已證明,不得用 RDP clipboard 片段當完成證據 | | 2 | CIR-P0-RBT-002 | P0 | 「沒有偵測到主機重啟」 | 修正 host reboot/shutdown/up detection:boot_id / uptime / node exporter / Windows exporter / VMware VM power state 都要進同一事件 | Scorecard 已接 collection packet + management probe;08:23 host probe 有 7 列但 99 只有 ping/RDP reachable、uptime unknown,111 unreachable,112/120/121/188 uptime 都已超過 10 分鐘窗口 | 讓 99 verifier / Windows exporter 或等效 no-secret readback 進入 host boot event,並補 111 reachability 證據 | -| 3 | CIR-P0-RBT-003 | P0 | 「192.168.0.99 VMWare 要自動啟動,裡面 111/188/120/121/112 也自動啟動」 | Windows 99 VMware host autostart + guest VM autostart contract;VM host 111/188/120/121/112 開機順序與 readback | Source verifier / parser / API readback / collection packet 已完成;08:23 management probe 讀回 99 host reachable、RDP / Hyper-V VMConnect reachable,但 SSH BatchMode `permission_denied`、WinRM timeout,collector `verify_collection_status=blocked_ssh_publickey_auth_missing`、`remote_verify_attempted=0` | 收集 local console Verify output 或恢復 no-secret management channel,再確認 `VMRUN_PRESENT`、scheduled task、VMware services、VM power、VMX present 全綠 | +| 3 | CIR-P0-RBT-003 | P0 | 「192.168.0.99 VMWare 要自動啟動,裡面 111/188/120/121/112 也自動啟動」 | Windows 99 VMware host autostart + guest VM autostart contract;VM host 111/188/120/121/112 開機順序與 readback | Source verifier / parser / API readback / collection packet 已完成;08:59 management probe 讀回 99 host reachable、RDP / Hyper-V VMConnect reachable,但 SSH BatchMode `permission_denied`、WinRM timeout,collector `verify_collection_status=blocked_ssh_publickey_auth_missing`、`remote_verify_attempted=0`;09:22 已新增 `console_artifact_status/reliable/blockers/safe_next_step` | 收集 validator 可接受的完整 local console Verify output 或恢復 no-secret management channel,再確認 `VMRUN_PRESENT`、scheduled task、VMware services、VM power、VMX present 全綠 | | 4 | CIR-P0-RBT-004 | P0 | 「192.168.0.99 不可因 Windows Update 無預警重開」 | Windows Update reboot policy:active hours / no auto-restart / maintenance window / update notification audit | Source verifier 已補 `WINDOWS_UPDATE_POLICY` 與 `WINDOWS_UPDATE_NO_AUTO_REBOOT_READY`;collection packet 已列 forbidden actions;99 management channel 尚不能收 policy readback | 取得 Verify output;若 policy 不綠,再走 controlled apply,禁止要求或記錄 Windows 密碼 | | 5 | CIR-P0-RBT-005 | P0 | 「網站重啟後 502 嚴重影響體驗,要維護頁,外部雲端或專業做法」 | Public maintenance fallback:Nginx / edge / external static maintenance page / status page / fail-open UX,避免 502 直出 | Source + runtime verifier 已實作;Gitea CD `#4459` Success、deploy marker `8d7a6faaf`,production API scorecard 已讀回 `public_maintenance_fallback.ready=true`、raw 5xx=`0`、unreachable without L1=`0`,P0 blockers `11`、readiness `47`;仍不可宣稱整體 10 分鐘 SLO 完成 | 若未來 public route 出現 raw 5xx,先走 L0 Nginx intercept / header verifier;若 edge unreachable,才產生 L1 external static origin / CDN decision record + rollback | | 6 | CIR-P0-RBT-006 | P0 | 「所有主機關機立刻 Telegram 告警,重啟後也要告警,其他告警一併完整思考」 | Down / shutdown suspected / reboot detected / reboot recovered / SLO missed / backup failed / freshness stale / CPU pressure / Gitea queue 告警矩陣 | HostDown / HostRebootEventDetected / RebootAutoRecoverySLOMissed 已存在;per-blocker reboot alerts 與 backup receipt rules 已 deploy/readback。Backup receipt 缺段已從 100 條 stage 噪音收斂為 110 / 188 兩條 host-level pending;仍需完整 shutdown/up E2E receipt | 補 Prometheus / Alertmanager active/resolved 與 outbound receipt;backup alert 先補 `/backup/alert-receipts/*.last_success` 脫敏 marker,不送測試 secret、不重啟主機 | | 7 | CIR-P0-RBT-007 | P0 | 「所有備份包含主機、DB、網站、服務、套件、工具、日誌都沒有監控告警」 | Backup observability coverage:backup job inventory、last success、freshness、offsite、restore drill、Telegram/AwoooP receipt | 已有 backup health exporter / alert rules / Gitea bundle restore dry-run;2026-07-03 runtime 讀回 110 有 88 個 receipt stage requirement、188 有 12 個,`BackupAlertReceiptMetricMissing*` inactive,`BackupAlertReceiptStageMissing` 聚合 pending 110 / 188 各一條 | 補 `/backup/alert-receipts/*.last_success`;再補 Gitea full dump / DB / settings / issues / packages / LFS 與所有工具/log 全量備份監控 | -| 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command | Production scorecard readback 已固定 `status=blocked_reboot_auto_recovery_slo_not_ready`、readiness `67%`、active blockers `8`、primary `reboot_event_required_host_unreachable`;08:23 artifact 固定 next action 為 99 no-secret Verify / 111 reachability / host probe detector rerun | 優先收斂 99 no-secret Verify / 111 reachability / 188 startup failed/degraded;不得用不同排查路徑繞過 scorecard | +| 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command | Production scorecard readback 已固定 `status=blocked_reboot_auto_recovery_slo_not_ready`、readiness `67%`、active blockers `8`、primary `reboot_event_required_host_unreachable`;09:22 source/API contract 已把 unreliable console artifact 接到 `active_blocker_action_matrix`、owner lane 與 next safe action | 優先收斂 99 no-secret Verify / 111 reachability / 188 startup failed/degraded;不得用不同排查路徑繞過 scorecard | | 9 | CIR-P0-RBT-009 | P0 | 「所有產品、網站都要是最新版本;版本和數據是否最新要驗證」 | Product freshness/version matrix:source commit、deploy marker、runtime image、public health、data freshness、latest source availability | AWOOOI Gitea main 已到 deploy marker `c68b74686`,production source readback `5d5bc86fed` verified;StockPlatform public freshness / ingestion 讀回 `ok`,latest trading date `2026-07-02`,core price/chips/margin/AI recommendations 都是 `2026-07-02` | 建立全產品 readback 表:product、canonical repo、main SHA、deploy marker、public URL、data freshness、blocked reason | | 10 | CIR-P0-GIT-001 | P0 | 「Gitea 儲存庫都不見了?Gitea 沒完整備份嗎?」 | Gitea repository identity + backup proof + restore drill:不能只看 UI visible,要比對 SSH heads、repo path、bundle backup、restore sample | 2026-07-02 production `/api/v1/agents/gitea-repo-bundle-backup-readback` 已 ready:9 expected repos present/ok、missing=0、failed=0、checksum_missing=0、bundle_fresh=true、all_expected_ok=true、sample_restore_dry_run_ok=true;repo bundle / restore dry-run 層已關閉,不是 repo missing。 | 維持每日 bundle backup + restore dry-run monitoring;另補 Gitea full dump / DB / settings / issues / packages / LFS 備份 readback。禁止刪 repo / 改 visibility / 讀 token / restore 到 production | | 11 | CIR-P0-CPU-001 | P0 | 「110 / 188 CPU 負載持續過高,為什麼沒監控告警、沒主動修復」 | Sustained CPU pressure automation:Alertmanager → controller → evidence → service playbook → verifier → KM writeback | 110 已有 `Host110SustainedModeratePressure`、Gitea playbook、Stock/Postgres evidence;188 仍需同級 controller/alerts readback | 下一步接 `postgres_hot_query_or_backup_export_playbook`;並補 188 equivalent readback,不以單次下降結案 | | 12 | CIR-P0-CPU-002 | P0 | 「噪音會影響真問題,要整合一起做」 | Alert noise / real issue correlation:backup aggregate noise、CPU pressure、Gitea queue、Stock freshness 要分清主因與次因 | 部分已在 SOP 註記;仍需統一 correlation scorecard | 建立 incident correlation readback:primary_blocker、secondary_noise、ignored_noise_reason、evidence_ref | | 13 | CIR-P0-CD-001 | P0 | 「所有專案都不能推版 / 要看到實作結果」 | Gitea-only CD baseline:每次 main push 要有 visible run、deploy marker、production readback;GitHub 不作解法 | AWOOOI 最新 main 可推,CD success/deploy marker 已多次證明;全產品未全綠 | 將 product governance matrix 接入各產品 Gitea CD readiness,不再只報 AWOOOI | | 14 | CIR-P1-AI-001 | P1 | 「AI 專業在哪?要能主動發現、主動修復」 | AI controlled repair loop:detect → classify → candidate → check-mode → controlled apply → post verifier → KM / PlayBook trust | CPU / Gitea / Telegram receipt 已部分落地;全域 AI loop 未全部接上 | 將每個 P0 runbook 補 `candidate_action`、`controlled_apply_allowed`、`post_verifier`、`trust_writeback` | -| 15 | CIR-P1-KM-001 | P1 | 「修復過程、經驗完整沉澱進 SOP,整合到目前版本」 | 所有 P0 修復必須同步 LOGBOOK、SOP、PlayBook、workplan ledger;不能只留在對話 | 本台帳、LOGBOOK、SOP 已開始補;仍需 API/UI read model | 把本台帳轉成 read-only API / governance UI row,並建立 `last_updated` / `evidence_count` | +| 15 | CIR-P1-KM-001 | P1 | 「修復過程、經驗完整沉澱進 SOP,整合到目前版本」 | 所有 P0 修復必須同步 LOGBOOK、SOP、PlayBook、workplan ledger;不能只留在對話 | 本台帳、LOGBOOK、SOP 已開始補;09:22 已把 Windows99 console clipboard 不可靠經驗寫入 SOP v1.108、P0 workplan 與 scorecard regression;仍需 API/UI read model | 把本台帳轉成 read-only API / governance UI row,並建立 `last_updated` / `evidence_count` | | 16 | CIR-P1-WORK-001 | P1 | 「所有已開始、進行中、已完成工作全部看清楚」 | 工作狀態盤點:Done / In Progress / Blocked / Deferred / Next Action + evidence | 本台帳已有初版 Done/In Progress/Blocked;需納入本節新 P0 | 更新下方 Done/In Progress/Blocked,把 reboot/backup/VMware/maintenance/CPU 全列入 | | 17 | CIR-P1-OPENCLAW-001 | P1 | 「OpenClaw 要有像 Gather Town 的工作畫面,而且是持續動畫」 | OpenClaw Live Ops Space:animated room、agent avatars、flow packets、work tokens、scene-state API、desktop/mobile smoke | `/zh-TW/openclaw/live-ops-space` route 已存在;仍需 production desktop/mobile animation smoke 與 AwoooP 導流 | 跑 OpenClaw live-ops desktop/mobile smoke,補 AwoooP 導流與截圖證據 | | 18 | CIR-P2-OBS-001 | P2 | 「其他還有哪些告警也要完整思考」 | Observability coverage expansion:SignOz/Sentry/Langfuse/Harbor/Registry/K3s/DB/backup/freshness/route/TLS 告警 | 多數 rule 分散存在;coverage matrix 不完整 | 建立 alert coverage matrix,區分 P0 actionable 與 P2 observability debt | diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index ec757e3fe..018523495 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -299,6 +299,10 @@ def parse_windows99_management_readback(path: Path | None) -> dict[str, Any]: "rdp_console_reachable": False, "local_console_channel_reachable": False, "console_collection_channels": [], + "console_artifact_status": "unknown", + "console_artifact_reliable": False, + "console_artifact_blockers": [], + "console_artifact_safe_next_step": "", "remote_execution_channel_ready": False, "can_collect_vmware_verify_without_secret": False, "blockers": [], @@ -345,6 +349,16 @@ def parse_windows99_management_readback(path: Path | None) -> dict[str, Any]: "console_collection_channels": strings( payload.get("console_collection_channels") ), + "console_artifact_status": str( + payload.get("console_artifact_status") or "unknown" + ), + "console_artifact_reliable": payload.get("console_artifact_reliable") is True, + "console_artifact_blockers": strings( + payload.get("console_artifact_blockers") + ), + "console_artifact_safe_next_step": str( + payload.get("console_artifact_safe_next_step") or "" + ), "remote_execution_channel_ready": ( payload.get("remote_execution_channel_ready") is True ), @@ -1024,6 +1038,15 @@ def build_windows99_verify_collection_packet( windows99_management.get("local_console_channel_reachable") is True ) console_channels = strings(windows99_management.get("console_collection_channels")) + console_artifact_status = str( + windows99_management.get("console_artifact_status") or "unknown" + ) + console_artifact_reliable = ( + windows99_management.get("console_artifact_reliable") is True + ) + console_artifact_blockers = strings( + windows99_management.get("console_artifact_blockers") + ) collector_present = windows99_collector.get("readback_present") is True collector_status = str(windows99_collector.get("status") or "unknown") collector_ssh_ready = ( @@ -1044,6 +1067,11 @@ def build_windows99_verify_collection_packet( for blocker in strings(windows99_collector.get("blockers")) if blocker not in collection_blockers ) + collection_blockers.extend( + blocker + for blocker in console_artifact_blockers + if blocker not in collection_blockers + ) if not host99_uptime_known: collection_blockers.append("windows99_uptime_unknown") @@ -1081,6 +1109,12 @@ def build_windows99_verify_collection_packet( ) ), "available_collection_channels": available_channels, + "console_artifact_status": console_artifact_status, + "console_artifact_reliable": console_artifact_reliable, + "console_artifact_blockers": console_artifact_blockers, + "console_artifact_safe_next_step": str( + windows99_management.get("console_artifact_safe_next_step") or "" + ), "no_secret_collector_readback_present": collector_present, "no_secret_collector_status": collector_status, "no_secret_collector_safe_next_step": str( @@ -1325,6 +1359,9 @@ def reboot_sop_current_phase(active_blockers: list[str], can_claim: bool) -> str "windows99_vmware_autostart_config_not_ready", "windows99_vmware_guest_power_not_ready", "windows99_update_no_auto_reboot_policy_not_ready", + "windows99_console_clipboard_unreliable", + "windows99_console_focus_unreliable", + "windows99_console_verify_output_truncated", } if any(blocker in host_boot_blockers for blocker in active_blockers): return "host_boot_detection_blocked" @@ -1373,6 +1410,9 @@ def reboot_sop_primary_blocker(active_blockers: list[str]) -> str: "windows99_vmware_autostart_config_not_ready", "windows99_vmware_guest_power_not_ready", "windows99_update_no_auto_reboot_policy_not_ready", + "windows99_console_clipboard_unreliable", + "windows99_console_focus_unreliable", + "windows99_console_verify_output_truncated", "public_route_raw_5xx_without_maintenance_fallback", "public_route_unreachable_without_external_l1_fallback", "public_maintenance_fallback_runtime_readback_missing", @@ -1519,6 +1559,15 @@ def active_blocker_action_row( "restore_windows99_no_secret_management_channel_or_collect_local_" "console_verify_readback_then_rerun_reboot_scorecard_no_reboot" ) + elif blocker in { + "windows99_console_clipboard_unreliable", + "windows99_console_focus_unreliable", + "windows99_console_verify_output_truncated", + }: + next_safe_action = ( + "stop_unreliable_rdp_clipboard_path_and_use_authorized_no_secret_" + "management_channel_or_validated_console_stdout_artifact" + ) post_verifier = ( "bash scripts/reboot-recovery/collect-windows99-vmware-verify.sh " "--check && rerun_reboot_auto_recovery_slo_scorecard" @@ -1883,6 +1932,18 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: "windows99_available_collection_channels": strings( windows99_verify_collection.get("available_collection_channels") ), + "windows99_console_artifact_status": str( + windows99_verify_collection.get("console_artifact_status") or "unknown" + ), + "windows99_console_artifact_reliable": ( + windows99_verify_collection.get("console_artifact_reliable") is True + ), + "windows99_console_artifact_blockers": strings( + windows99_verify_collection.get("console_artifact_blockers") + ), + "windows99_console_artifact_safe_next_step": str( + windows99_verify_collection.get("console_artifact_safe_next_step") or "" + ), "windows99_host99_reachable": ( windows99_verify_collection["host99_reachable"] is True ), @@ -1968,6 +2029,18 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: "windows99_available_collection_channels": rollups[ "windows99_available_collection_channels" ], + "windows99_console_artifact_status": rollups[ + "windows99_console_artifact_status" + ], + "windows99_console_artifact_reliable": rollups[ + "windows99_console_artifact_reliable" + ], + "windows99_console_artifact_blockers": rollups[ + "windows99_console_artifact_blockers" + ], + "windows99_console_artifact_safe_next_step": rollups[ + "windows99_console_artifact_safe_next_step" + ], "windows99_remote_execution_channel_ready": rollups[ "windows99_remote_execution_channel_ready" ], @@ -2173,6 +2246,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: blockers.extend(strings(host_pressure.get("blockers"))) blockers.extend(strings(public_maintenance.get("blockers"))) blockers.extend(strings(windows99.get("blockers"))) + blockers.extend(strings(windows99_management.get("console_artifact_blockers"))) if ( windows99.get("readback_present") is False and windows99_management.get("readback_present") is True diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index 6117523b8..a75167832 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -598,6 +598,56 @@ def test_windows99_management_channel_unavailable_is_visible(tmp_path: Path) -> assert payload["readback"]["windows99_remote_execution_channel_ready"] is False +def test_windows99_console_artifact_blocker_is_visible(tmp_path: Path) -> None: + management = { + **WINDOWS99_MANAGEMENT_BLOCKED, + "console_artifact_status": "blocked_clipboard_unreliable", + "console_artifact_reliable": False, + "console_artifact_blockers": ["windows99_console_clipboard_unreliable"], + "console_artifact_safe_next_step": ( + "use_authorized_no_secret_management_channel_or_manual_console_stdout_capture" + ), + "blockers": [ + *WINDOWS99_MANAGEMENT_BLOCKED["blockers"], + "windows99_console_clipboard_unreliable", + ], + } + payload = run_scorecard( + tmp_path, + GREEN_SUMMARY, + windows99="", + windows99_management=json.dumps(management), + ) + + assert "windows99_console_clipboard_unreliable" in payload["active_blockers"] + collection = payload["windows99_verify_collection"] + assert collection["console_artifact_status"] == "blocked_clipboard_unreliable" + assert collection["console_artifact_reliable"] is False + assert collection["console_artifact_blockers"] == [ + "windows99_console_clipboard_unreliable" + ] + assert "windows99_console_clipboard_unreliable" in collection[ + "collection_blockers" + ] + assert payload["rollups"]["windows99_console_artifact_status"] == ( + "blocked_clipboard_unreliable" + ) + assert payload["rollups"]["windows99_console_artifact_reliable"] is False + assert payload["readback"]["windows99_console_artifact_blockers"] == [ + "windows99_console_clipboard_unreliable" + ] + action_by_blocker = { + item["blocker"]: item + for item in payload["active_blocker_action_matrix"]["items"] + } + assert action_by_blocker["windows99_console_clipboard_unreliable"][ + "next_safe_action" + ] == ( + "stop_unreliable_rdp_clipboard_path_and_use_authorized_no_secret_" + "management_channel_or_validated_console_stdout_artifact" + ) + + def test_windows99_no_secret_collector_publickey_blocker_is_visible( tmp_path: Path, ) -> None: diff --git a/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py b/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py index 7d78eafb7..5743d5828 100644 --- a/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py +++ b/scripts/reboot-recovery/tests/test_windows99_management_channel_probe.py @@ -51,6 +51,8 @@ def test_management_probe_surfaces_no_secret_console_channels(monkeypatch): ssh_timeout=1, ports=None, skip_ssh=False, + console_artifact_status="not_attempted", + console_artifact_blockers=None, generated_at="2026-07-02T16:00:00+08:00", output=None, ) @@ -65,6 +67,9 @@ def test_management_probe_surfaces_no_secret_console_channels(monkeypatch): "rdp_console", "hyperv_vmconnect", ] + assert payload["console_artifact_status"] == "not_attempted" + assert payload["console_artifact_reliable"] is False + assert payload["console_artifact_blockers"] == [] assert payload["remote_execution_channel_ready"] is False assert payload["can_collect_vmware_verify_without_secret"] is False assert "windows99_remote_execution_channel_unavailable" in payload["blockers"] @@ -105,6 +110,8 @@ def test_management_probe_surfaces_multiple_ssh_candidates(monkeypatch): ssh_timeout=1, ports=None, skip_ssh=False, + console_artifact_status="not_attempted", + console_artifact_blockers=None, generated_at="2026-07-02T16:00:00+08:00", output=None, ) @@ -129,3 +136,51 @@ def test_management_probe_surfaces_multiple_ssh_candidates(monkeypatch): "status": "ready", }, ] + + +def test_management_probe_surfaces_console_artifact_clipboard_blocker(monkeypatch): + module = _load_module() + + def fake_tcp_status(_host: str, port: int, _timeout: float) -> str: + return "open" if port in {22, 2179, 3389} else "timeout" + + monkeypatch.setattr(module, "tcp_status", fake_tcp_status) + monkeypatch.setattr( + module, + "ping_status", + lambda _host: {"checked": True, "ok": True, "status": "ok"}, + ) + monkeypatch.setattr( + module, + "ssh_batch_status", + lambda *_args, **_kwargs: { + "checked": True, + "ready": False, + "status": "permission_denied", + }, + ) + + payload = module.build_payload( + argparse.Namespace( + host="192.168.0.99", + ssh_users=["administrator"], + tcp_timeout=0.01, + ssh_timeout=1, + ports=None, + skip_ssh=False, + console_artifact_status="blocked_clipboard_unreliable", + console_artifact_blockers=None, + generated_at="2026-07-03T09:20:00+08:00", + output=None, + ) + ) + + assert payload["console_artifact_status"] == "blocked_clipboard_unreliable" + assert payload["console_artifact_reliable"] is False + assert payload["console_artifact_blockers"] == [ + "windows99_console_clipboard_unreliable" + ] + assert "windows99_console_clipboard_unreliable" in payload["blockers"] + assert payload["console_artifact_safe_next_step"] == ( + "use_authorized_no_secret_management_channel_or_manual_console_stdout_capture" + ) diff --git a/scripts/reboot-recovery/windows99-management-channel-probe.py b/scripts/reboot-recovery/windows99-management-channel-probe.py index 7c74e198a..7a631711a 100644 --- a/scripts/reboot-recovery/windows99-management-channel-probe.py +++ b/scripts/reboot-recovery/windows99-management-channel-probe.py @@ -45,6 +45,28 @@ def parse_args() -> argparse.Namespace: help="TCP port to probe. May be passed more than once.", ) parser.add_argument("--skip-ssh", action="store_true") + parser.add_argument( + "--console-artifact-status", + default="not_attempted", + choices=[ + "not_attempted", + "blocked_clipboard_unreliable", + "blocked_focus_unreliable", + "blocked_truncated_output", + "collected_stdout", + "validated_artifact", + ], + help=( + "Optional no-secret console artifact collection status. Use a blocked " + "value only after an attempted console stdout capture path fails." + ), + ) + parser.add_argument( + "--console-artifact-blocker", + action="append", + dest="console_artifact_blockers", + help="Optional machine-readable console artifact blocker. May be repeated.", + ) parser.add_argument("--generated-at", help="Override generated_at.") parser.add_argument("--output", type=Path, help="Write JSON to this path.") args = parser.parse_args() @@ -204,6 +226,24 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: ssh_probe = dict(ready_candidate or ssh_batch_candidates[0]) ssh_probe.pop("user", None) remote_execution_ready = ssh_probe["ready"] is True + console_artifact_status = str( + getattr(args, "console_artifact_status", "not_attempted") + or "not_attempted" + ) + console_artifact_blockers = list( + getattr(args, "console_artifact_blockers", None) or [] + ) + if console_artifact_status == "blocked_clipboard_unreliable": + console_artifact_blockers.append("windows99_console_clipboard_unreliable") + elif console_artifact_status == "blocked_focus_unreliable": + console_artifact_blockers.append("windows99_console_focus_unreliable") + elif console_artifact_status == "blocked_truncated_output": + console_artifact_blockers.append("windows99_console_verify_output_truncated") + console_artifact_blockers = list(dict.fromkeys(console_artifact_blockers)) + console_artifact_reliable = console_artifact_status in { + "collected_stdout", + "validated_artifact", + } blockers: list[str] = [] if not host_reachable: blockers.append("windows99_host_unreachable_from_management_probe") @@ -213,6 +253,7 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: blockers.append("windows99_winrm_unavailable") if tcp_ports.get("22") == "open" and ssh_probe["status"] == "permission_denied": blockers.append("windows99_ssh_batch_denied") + blockers.extend(console_artifact_blockers) return { "schema_version": SCHEMA_VERSION, @@ -243,6 +284,18 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: "rdp_console_reachable": rdp_console_reachable, "local_console_channel_reachable": local_console_channel_reachable, "console_collection_channels": console_collection_channels, + "console_artifact_status": console_artifact_status, + "console_artifact_reliable": console_artifact_reliable, + "console_artifact_blockers": console_artifact_blockers, + "console_artifact_safe_next_step": ( + "validate_collected_console_stdout_then_rerun_reboot_scorecard" + if console_artifact_reliable + else ( + "use_authorized_no_secret_management_channel_or_manual_console_stdout_capture" + if console_artifact_blockers + else "attempt_console_stdout_capture_only_if_focus_and_clipboard_are_reliable" + ) + ), "remote_execution_channel_ready": remote_execution_ready, "can_collect_vmware_verify_without_secret": remote_execution_ready, "blockers": blockers,