diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 27913306..b5a357d0 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -667,6 +667,8 @@ jobs: ;; scripts/reboot-recovery/windows99-vmware-autostart.ps1) ;; + scripts/reboot-recovery/windows99-management-channel-probe.py) + ;; scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py) ;; scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py) @@ -831,6 +833,7 @@ jobs: ../../scripts/reboot-recovery/post-reboot-owner-response-preflight.py \ ../../scripts/reboot-recovery/momo-source-arrival-gate.py \ ../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py \ + ../../scripts/reboot-recovery/windows99-management-channel-probe.py \ ../../scripts/ops/backup-alert-label-contract-check.py \ ../../scripts/ops/backup-health-textfile-exporter.py \ ../../scripts/ops/docker-disk-pressure-retention-cleanup.py \ diff --git a/apps/api/src/services/awoooi_priority_work_order_readback.py b/apps/api/src/services/awoooi_priority_work_order_readback.py index 99602cf6..a04e7d8d 100644 --- a/apps/api/src/services/awoooi_priority_work_order_readback.py +++ b/apps/api/src/services/awoooi_priority_work_order_readback.py @@ -2564,6 +2564,8 @@ def _enrich_from_current_readbacks(payload: dict[str, Any]) -> None: windows99_verify_collection = _dict( reboot_slo.get("windows99_verify_collection") ) + windows99_management = _dict(reboot_slo.get("windows99_management_channel")) + windows99_ssh_batch = _dict(windows99_management.get("ssh_batch")) reboot_preflight_rollups = _dict(reboot_preflight.get("rollups")) reboot_preflight_target_selector = _dict(reboot_preflight.get("target_selector")) reboot_active_blockers = _strings(reboot_slo.get("active_blockers")) @@ -2657,6 +2659,27 @@ def _enrich_from_current_readbacks(payload: dict[str, Any]) -> None: state["windows99_verify_collection_post_verifier"] = str( windows99_verify_collection.get("post_verifier") or "" ) + state["windows99_management_readback_present"] = bool( + windows99_management.get("readback_present") is True + ) + state["windows99_management_host_reachable"] = bool( + windows99_management.get("host_reachable") is True + ) + state["windows99_remote_execution_channel_ready"] = bool( + windows99_management.get("remote_execution_channel_ready") is True + ) + state["windows99_ssh_batch_status"] = str( + windows99_ssh_batch.get("status") or "unknown" + ) + state["windows99_rdp_console_reachable"] = bool( + windows99_management.get("rdp_console_reachable") is True + ) + state["windows99_winrm_http_open"] = bool( + windows99_management.get("winrm_http_open") is True + ) + state["windows99_winrm_https_open"] = bool( + windows99_management.get("winrm_https_open") is True + ) state["stale_snapshot_or_old_cd_runs_must_not_reopen_closed_work"] = True state["p0_004_template_copy_apply_gate_production_http_status"] = 200 state["p0_004_template_copy_apply_gate_runtime_readback_state"] = ( @@ -2829,6 +2852,23 @@ def _enrich_from_current_readbacks(payload: dict[str, Any]) -> None: evidence["windows99_verify_collection_host99_uptime_known"] = bool( windows99_verify_collection.get("host99_uptime_known") is True ) + evidence["windows99_management_readback_present"] = state[ + "windows99_management_readback_present" + ] + evidence["windows99_management_host_reachable"] = state[ + "windows99_management_host_reachable" + ] + evidence["windows99_remote_execution_channel_ready"] = state[ + "windows99_remote_execution_channel_ready" + ] + evidence["windows99_ssh_batch_status"] = state[ + "windows99_ssh_batch_status" + ] + evidence["windows99_rdp_console_reachable"] = state[ + "windows99_rdp_console_reachable" + ] + evidence["windows99_winrm_http_open"] = state["windows99_winrm_http_open"] + evidence["windows99_winrm_https_open"] = state["windows99_winrm_https_open"] evidence["drill_preflight_status"] = str(reboot_preflight.get("status") or "") evidence["drill_preflight_ready"] = ( reboot_preflight_rollups.get("preflight_ready") is True @@ -2892,7 +2932,8 @@ def _enrich_from_current_readbacks(payload: dict[str, Any]) -> None: "controlled service/data/backup readback visible as the source " "selector for these blockers. For Windows 99, use the " "windows99_verify_collection packet to collect no-secret Verify " - "stdout and rerun the scorecard. For StockPlatform " + "stdout, keep windows99_management_channel visible for the current " + "remote execution blocker, and rerun the scorecard. For StockPlatform " "freshness/ingestion postgres_not_ready, use the production " "migration/control-channel path with target selector, dry-run, " "rollback, public API verifier, and KM/RAG/MCP/PlayBook writeback. " @@ -3092,6 +3133,18 @@ def _set_rollups_and_summary( "windows99_verify_collection_host99_uptime_known": ( state.get("windows99_verify_collection_host99_uptime_known") is True ), + "windows99_management_readback_present": ( + state.get("windows99_management_readback_present") is True + ), + "windows99_remote_execution_channel_ready": ( + state.get("windows99_remote_execution_channel_ready") is True + ), + "windows99_ssh_batch_status": str( + state.get("windows99_ssh_batch_status") or "unknown" + ), + "windows99_rdp_console_reachable": ( + state.get("windows99_rdp_console_reachable") is True + ), "p0_004_runtime_readback_ready": p0_004_ready, "reboot_drill_preflight_runtime_readback_ready": ( state.get("reboot_drill_preflight_runtime_readback_state") == "ready" @@ -3148,6 +3201,27 @@ def _set_rollups_and_summary( "windows99_verify_collection_post_verifier": str( state.get("windows99_verify_collection_post_verifier") or "" ), + "windows99_management_readback_present": ( + state.get("windows99_management_readback_present") is True + ), + "windows99_management_host_reachable": ( + state.get("windows99_management_host_reachable") is True + ), + "windows99_remote_execution_channel_ready": ( + state.get("windows99_remote_execution_channel_ready") is True + ), + "windows99_ssh_batch_status": str( + state.get("windows99_ssh_batch_status") or "unknown" + ), + "windows99_rdp_console_reachable": ( + state.get("windows99_rdp_console_reachable") is True + ), + "windows99_winrm_http_open": ( + state.get("windows99_winrm_http_open") is True + ), + "windows99_winrm_https_open": ( + state.get("windows99_winrm_https_open") is True + ), "latest_successful_deployed_source_sha": latest_source_sha, "latest_successful_deployed_source_short_sha": latest_source_sha[:10], "latest_successful_deploy_marker": str( diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 8e9f8e39..03df015f 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -43,6 +43,8 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: post_reboot_readiness = _dict(scorecard.get("post_reboot_readiness")) stockplatform = _dict(scorecard.get("stockplatform_data_freshness")) windows99 = _dict(scorecard.get("windows99_vmware_autostart")) + windows99_management = _dict(scorecard.get("windows99_management_channel")) + windows99_ssh_batch = _dict(windows99_management.get("ssh_batch")) source_controls = _dict(scorecard.get("source_controls")) active_blockers = _strings(scorecard.get("active_blockers")) required_checks = { @@ -53,6 +55,10 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows_update_no_auto_reboot_ready": ( windows99.get("windows_update_no_auto_reboot_ready") is True ), + "windows99_management_channel_ready": ( + windows99.get("verify_ready") is True + or windows99_management.get("remote_execution_channel_ready") is True + ), "service_green": post_reboot_readiness.get("service_green") is True, "product_data_green": post_reboot_readiness.get("product_data_green") is True, "backup_core_green": post_reboot_readiness.get("backup_core_green") is True, @@ -197,6 +203,31 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_host99_uptime_known": ( windows99_verify_collection["host99_uptime_known"] is True ), + "windows99_management_readback_present": ( + windows99_management.get("readback_present") is True + ), + "windows99_host_reachable": ( + windows99_management.get("host_reachable") is True + ), + "windows99_remote_execution_channel_ready": ( + windows99_management.get("remote_execution_channel_ready") is True + ), + "windows99_can_collect_vmware_verify_without_secret": ( + windows99_management.get("can_collect_vmware_verify_without_secret") is True + ), + "windows99_ssh_batch_ready": windows99_ssh_batch.get("ready") is True, + "windows99_ssh_batch_status": str( + windows99_ssh_batch.get("status") or "unknown" + ), + "windows99_winrm_http_open": ( + windows99_management.get("winrm_http_open") is True + ), + "windows99_winrm_https_open": ( + windows99_management.get("winrm_https_open") is True + ), + "windows99_rdp_console_reachable": ( + windows99_management.get("rdp_console_reachable") is True + ), } return { "schema_version": _API_SCHEMA_VERSION, @@ -238,6 +269,10 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_update_no_auto_reboot_ready": rollups[ "windows99_update_no_auto_reboot_ready" ], + "windows99_remote_execution_channel_ready": rollups[ + "windows99_remote_execution_channel_ready" + ], + "windows99_ssh_batch_status": rollups["windows99_ssh_batch_status"], "readback": { "workplan_id": "P0-006", "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", @@ -273,6 +308,10 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "windows99_verify_collection_can_collect_no_secret": rollups[ "windows99_verify_collection_can_collect_no_secret" ], + "windows99_remote_execution_channel_ready": rollups[ + "windows99_remote_execution_channel_ready" + ], + "windows99_ssh_batch_status": rollups["windows99_ssh_batch_status"], }, "reboot_sop_progress": sop_progress, "controlled_service_data_backup_readback": ( @@ -283,6 +322,7 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "stockplatform_data_freshness": stockplatform, "windows99_vmware_autostart": windows99, "windows99_verify_collection": windows99_verify_collection, + "windows99_management_channel": windows99_management, "source_controls": source_controls, "active_blockers": active_blockers, "required_checks": required_checks, @@ -547,6 +587,7 @@ def _reboot_sop_current_phase(active_blockers: list[str], can_claim_slo: bool) - "fresh_all_host_reboot_event_missing", "all_required_hosts_not_in_10_minute_reboot_window", "windows99_vmware_autostart_readback_missing", + "windows99_remote_execution_channel_unavailable", "windows99_vmrun_missing", "windows99_vmware_vmx_missing", "windows99_vmware_autostart_config_not_ready", @@ -593,6 +634,7 @@ def _reboot_sop_primary_blocker(active_blockers: list[str]) -> str: "host_boot_observation_older_than_target_window", "host_uptime_unknown", "windows99_vmware_autostart_readback_missing", + "windows99_remote_execution_channel_unavailable", "windows99_vmrun_missing", "windows99_vmware_vmx_missing", "windows99_vmware_autostart_config_not_ready", diff --git a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py index 554237e6..ad1a8e4c 100644 --- a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py +++ b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py @@ -58,7 +58,7 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): "blockers_open" ) assert payload["mainline_execution_state"]["active_p0_immediate_apply_gap_count"] == 0 - assert payload["mainline_execution_state"]["active_p0_readiness_percent"] == 15 + assert payload["mainline_execution_state"]["active_p0_readiness_percent"] == 43 assert ( payload["mainline_execution_state"][ "controlled_service_data_backup_readback_present" @@ -75,19 +75,15 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): payload["mainline_execution_state"][ "controlled_service_data_backup_blocker_count" ] - == 8 + == 4 ) assert payload["mainline_execution_state"][ "controlled_service_data_backup_blocking_fields" ] == [ "service_green", "post_start_blocked", - "product_data_green", "backup_core_green", - "host_188_service_green", "wazuh_dashboard_degraded", - "stockplatform_freshness_status", - "stockplatform_ingestion_status", ] assert ( payload["mainline_execution_state"][ @@ -128,20 +124,36 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): ] is False ) + assert ( + payload["mainline_execution_state"]["windows99_management_readback_present"] + is True + ) + assert ( + payload["mainline_execution_state"][ + "windows99_remote_execution_channel_ready" + ] + is False + ) + assert ( + payload["mainline_execution_state"]["windows99_ssh_batch_status"] + == "permission_denied" + ) + assert ( + payload["mainline_execution_state"]["windows99_rdp_console_reachable"] + is True + ) assert payload["next_execution_order"][0].startswith("P0-006:") in_progress = payload["in_progress_or_blocked_in_priority_order"][0] assert in_progress["workplan_id"] == "P0-006" assert in_progress["status"] == "blocked_reboot_auto_recovery_slo_not_ready" assert in_progress["evidence"]["service_green"] is False - assert in_progress["evidence"]["product_data_green"] is False + assert in_progress["evidence"]["product_data_green"] is True assert in_progress["evidence"]["backup_core_green"] is False - assert in_progress["evidence"]["host_188_service_green"] is False - assert in_progress["evidence"]["stock_freshness_status"] == "not_configured" - assert in_progress["evidence"]["stock_ingestion_status"] == "not_configured" - assert in_progress["evidence"]["stock_blockers"] == ["postgres_not_ready"] - assert in_progress["evidence"]["stock_ingestion_blockers"] == [ - "postgres_not_ready" - ] + assert in_progress["evidence"]["host_188_service_green"] is True + assert in_progress["evidence"]["stock_freshness_status"] == "ok" + assert in_progress["evidence"]["stock_ingestion_status"] == "ok" + assert in_progress["evidence"]["stock_blockers"] == [] + assert in_progress["evidence"]["stock_ingestion_blockers"] == [] assert ( in_progress["evidence"][ "controlled_service_data_backup_readback_present" @@ -153,18 +165,14 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): ] == "blocked_service_data_backup_readback_not_green" assert in_progress["evidence"][ "controlled_service_data_backup_blocker_count" - ] == 8 + ] == 4 assert in_progress["evidence"][ "controlled_service_data_backup_blocking_fields" ] == [ "service_green", "post_start_blocked", - "product_data_green", "backup_core_green", - "host_188_service_green", "wazuh_dashboard_degraded", - "stockplatform_freshness_status", - "stockplatform_ingestion_status", ] assert in_progress["evidence"]["windows99_verify_collection_status"] == ( "blocked_windows99_verify_output_missing_host_reachable" @@ -187,6 +195,16 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): in_progress["evidence"]["windows99_verify_collection_host99_uptime_known"] is False ) + assert in_progress["evidence"]["windows99_management_readback_present"] is True + assert in_progress["evidence"]["windows99_management_host_reachable"] is True + assert ( + in_progress["evidence"]["windows99_remote_execution_channel_ready"] + is False + ) + assert in_progress["evidence"]["windows99_ssh_batch_status"] == ( + "permission_denied" + ) + assert in_progress["evidence"]["windows99_rdp_console_reachable"] is True assert ( in_progress["evidence"][ "controlled_service_data_backup_can_clear_blockers" @@ -194,7 +212,7 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): is False ) assert in_progress["evidence"]["drill_preflight_ready"] is False - assert in_progress["evidence"]["drill_preflight_blocker_count"] == 9 + assert in_progress["evidence"]["drill_preflight_blocker_count"] == 5 assert ( in_progress["evidence"][ "drill_preflight_execution_authorized_by_this_endpoint" @@ -210,6 +228,9 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): assert "windows99_verify_collection packet" in in_progress[ "professional_fix" ]["action"] + assert "windows99_management_channel visible" in in_progress[ + "professional_fix" + ]["action"] assert "production migration/control-channel" in in_progress[ "professional_fix" ]["action"] @@ -222,12 +243,12 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): payload["rollups"]["controlled_service_data_backup_readback_present"] is True ) - assert payload["rollups"]["controlled_service_data_backup_blocker_count"] == 8 + assert payload["rollups"]["controlled_service_data_backup_blocker_count"] == 4 assert ( payload["summary"]["controlled_service_data_backup_readback_present"] is True ) - assert payload["summary"]["controlled_service_data_backup_blocker_count"] == 8 + assert payload["summary"]["controlled_service_data_backup_blocker_count"] == 4 assert payload["summary"][ "controlled_service_data_backup_next_safe_action" ] == ( @@ -251,6 +272,15 @@ def test_awoooi_priority_work_order_readback_loader_returns_mainline_order(): payload["rollups"]["windows99_verify_collection_host99_reachable"] is True ) + assert payload["rollups"]["windows99_management_readback_present"] is True + assert payload["rollups"]["windows99_remote_execution_channel_ready"] is False + assert payload["rollups"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["rollups"]["windows99_rdp_console_reachable"] is True + assert payload["summary"]["windows99_management_readback_present"] is True + assert payload["summary"]["windows99_management_host_reachable"] is True + assert payload["summary"]["windows99_remote_execution_channel_ready"] is False + assert payload["summary"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["summary"]["windows99_rdp_console_reachable"] is True assert payload["operation_boundaries"]["github_api_used"] is False assert payload["operation_boundaries"]["github_cli_used"] is False assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 771234b3..43227b57 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -14,31 +14,30 @@ from src.services.reboot_auto_recovery_slo_scorecard import ( EXPECTED_REBOOT_SLO_BLOCKERS = [ "all_required_hosts_not_in_10_minute_reboot_window", "backup_core_green_not_1", - "host_188_service_green_not_1", "host_boot_observation_older_than_target_window", "host_unreachable_after_reboot", "host_uptime_unknown", - "local_disk_free_below_minimum", "post_start_blocked_not_zero", - "product_data_green_not_1", "reboot_event_required_host_unreachable", "service_green_not_1", "wazuh_dashboard_degraded", + "windows99_remote_execution_channel_unavailable", "windows99_vmware_autostart_readback_missing", ] EXPECTED_DRILL_PREFLIGHT_BLOCKERS = [ "service_green_not_ready", - "product_data_green_not_ready", "backup_core_green_not_ready", - "host_188_service_green_not_ready", "all_required_hosts_reachable_not_ready", "latest_verify_only_metric_present_not_ready", - "stockplatform_freshness_ok_not_ready", - "stockplatform_ingestion_ok_not_ready", "blocked_only_by_fresh_reboot_window_not_ready", ] +EXPECTED_WINDOWS99_NEXT_STEP = ( + "restore_windows99_no_secret_management_channel_or_collect_local_console_" + "verify_readback_then_rerun_reboot_scorecard_no_reboot" +) + def test_reboot_auto_recovery_slo_scorecard_loader_exposes_stockplatform_gate(): payload = load_latest_reboot_auto_recovery_slo_scorecard() @@ -84,18 +83,15 @@ def _assert_reboot_slo_payload(payload: dict): "event_and_probe" ) assert payload["primary_blocker"] == "reboot_event_required_host_unreachable" - assert payload["safe_next_step"] == ( - "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_" - "reboot_scorecard_no_secret_no_reboot" - ) + assert payload["safe_next_step"] == EXPECTED_WINDOWS99_NEXT_STEP assert payload["next_safe_action"] == payload["safe_next_step"] assert payload["can_claim_all_services_recovered_within_target"] is False - assert payload["active_blocker_count"] == 13 - assert payload["readiness_percent"] == 15 + assert payload["active_blocker_count"] == 11 + assert payload["readiness_percent"] == 43 assert payload["service_green"] is False - assert payload["product_data_green"] is False + assert payload["product_data_green"] is True assert payload["backup_core_green"] is False - assert payload["host_188_service_green"] is False + assert payload["host_188_service_green"] is True assert payload["observed_host_count"] == 7 assert payload["missing_host_count"] == 0 assert payload["unreachable_host_count"] == 1 @@ -106,13 +102,18 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["latest_verify_only_metric_blocker_count"] == 0 assert payload["latest_verify_only_metric_max_host_uptime_seconds"] == 0 assert payload["latest_verify_only_metric_last_run_timestamp"] == 0 - assert payload["stockplatform_freshness_status"] == "not_configured" - assert payload["stockplatform_ingestion_status"] == "not_configured" + assert payload["stockplatform_freshness_status"] == "ok" + assert payload["stockplatform_ingestion_status"] == "ok" assert payload["windows99_vmware_verify_ready"] is False assert payload["windows99_update_no_auto_reboot_ready"] is False + assert payload["windows99_remote_execution_channel_ready"] is False + assert payload["windows99_ssh_batch_status"] == "permission_denied" assert payload["source_controls"][ "host_110_startup_controlled_drain_guarded_autostart_source_present" ] is True + assert payload["source_controls"][ + "windows99_management_channel_probe_source_present" + ] is True assert payload["readback"]["workplan_id"] == "P0-006" assert payload["readback"]["target_minutes"] == 10 assert payload["readback"]["current_phase"] == "host_boot_detection_blocked" @@ -123,13 +124,10 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["readback"]["primary_blocker"] == ( "reboot_event_required_host_unreachable" ) - assert payload["readback"]["safe_next_step"] == ( - "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_" - "reboot_scorecard_no_secret_no_reboot" - ) + assert payload["readback"]["safe_next_step"] == EXPECTED_WINDOWS99_NEXT_STEP assert payload["readback"]["next_safe_action"] == payload["safe_next_step"] - assert payload["readback"]["active_blocker_count"] == 13 - assert payload["readback"]["readiness_percent"] == 15 + assert payload["readback"]["active_blocker_count"] == 11 + assert payload["readback"]["readiness_percent"] == 43 assert payload["readback"]["blocked_by_fresh_reboot_window_only"] is False assert payload["readback"]["latest_verify_only_metric_present"] is False assert payload["readback"]["windows99_vmware_verify_ready"] is False @@ -141,29 +139,31 @@ def _assert_reboot_slo_payload(payload: dict): payload["readback"]["windows99_verify_collection_can_collect_no_secret"] is True ) - assert payload["rollups"]["active_blocker_count"] == 13 - assert payload["rollups"]["readiness_percent"] == 15 + assert payload["readback"]["windows99_remote_execution_channel_ready"] is False + assert payload["readback"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["rollups"]["active_blocker_count"] == 11 + assert payload["rollups"]["readiness_percent"] == 43 assert payload["rollups"]["observed_host_count"] == 7 assert payload["rollups"]["missing_host_count"] == 0 assert payload["rollups"]["unreachable_host_count"] == 1 assert payload["rollups"]["stale_host_count"] == 5 assert payload["rollups"]["service_green"] is False - assert payload["rollups"]["product_data_green"] is False + assert payload["rollups"]["product_data_green"] is True assert payload["rollups"]["backup_core_green"] is False - assert payload["rollups"]["host_188_service_green"] is False + assert payload["rollups"]["host_188_service_green"] is True assert payload["rollups"]["blocked_by_fresh_reboot_window_only"] is False assert payload["rollups"]["latest_verify_only_metric_present"] is False assert payload["rollups"]["latest_verify_only_metric_ready"] == 0 assert payload["rollups"]["latest_verify_only_metric_blocker_count"] == 0 - assert payload["rollups"]["stockplatform_freshness_status"] == "not_configured" - assert payload["rollups"]["stockplatform_ingestion_status"] == "not_configured" - assert payload["rollups"]["stockplatform_freshness_blocker_count"] == 1 - assert payload["rollups"]["stockplatform_ingestion_blocker_count"] == 1 + assert payload["rollups"]["stockplatform_freshness_status"] == "ok" + assert payload["rollups"]["stockplatform_ingestion_status"] == "ok" + assert payload["rollups"]["stockplatform_freshness_blocker_count"] == 0 + assert payload["rollups"]["stockplatform_ingestion_blocker_count"] == 0 assert ( payload["rollups"]["controlled_service_data_backup_readback_present"] is True ) - assert payload["rollups"]["controlled_service_data_backup_blocker_count"] == 8 + assert payload["rollups"]["controlled_service_data_backup_blocker_count"] == 4 assert ( payload["rollups"]["controlled_service_data_backup_can_clear_blockers"] is False @@ -184,6 +184,11 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["rollups"]["windows99_verify_collection_blocker_count"] == 2 assert payload["rollups"]["windows99_host99_reachable"] is True assert payload["rollups"]["windows99_host99_uptime_known"] is False + assert payload["rollups"]["windows99_management_readback_present"] is True + assert payload["rollups"]["windows99_host_reachable"] is True + assert payload["rollups"]["windows99_remote_execution_channel_ready"] is False + assert payload["rollups"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["rollups"]["windows99_rdp_console_reachable"] is True assert payload["rollups"]["stockplatform_final_retry_window_passed"] is False assert ( payload["rollups"]["stockplatform_controlled_recovery_gate_required"] @@ -197,8 +202,8 @@ def _assert_reboot_slo_payload(payload: dict): "event_and_probe" ) assert progress["primary_blocker"] == "reboot_event_required_host_unreachable" - assert progress["active_blocker_count"] == 13 - assert progress["readiness_percent"] == 15 + assert progress["active_blocker_count"] == 11 + assert progress["readiness_percent"] == 43 assert progress["next_safe_action"] == payload["safe_next_step"] assert progress["fixed_triage_order"][0] == "99_vmware_autostart_and_vm_power" service_data_backup = payload["controlled_service_data_backup_readback"] @@ -210,28 +215,22 @@ def _assert_reboot_slo_payload(payload: dict): ) assert service_data_backup["readback_present"] is True assert service_data_backup["service_green"] is False - assert service_data_backup["product_data_green"] is False + assert service_data_backup["product_data_green"] is True assert service_data_backup["backup_core_green"] is False - assert service_data_backup["host_188_service_green"] is False - assert service_data_backup["post_start_blocked"] == 8 + assert service_data_backup["host_188_service_green"] is True + assert service_data_backup["post_start_blocked"] == 1 assert service_data_backup["wazuh_dashboard_degraded"] is True - assert service_data_backup["stockplatform_freshness_status"] == "not_configured" - assert service_data_backup["stockplatform_ingestion_status"] == "not_configured" + assert service_data_backup["stockplatform_freshness_status"] == "ok" + assert service_data_backup["stockplatform_ingestion_status"] == "ok" assert service_data_backup["blocking_fields"] == [ "service_green", "post_start_blocked", - "product_data_green", "backup_core_green", - "host_188_service_green", "wazuh_dashboard_degraded", - "stockplatform_freshness_status", - "stockplatform_ingestion_status", ] assert service_data_backup["related_active_blockers"] == [ "backup_core_green_not_1", - "host_188_service_green_not_1", "post_start_blocked_not_zero", - "product_data_green_not_1", "service_green_not_1", "wazuh_dashboard_degraded", ] @@ -265,19 +264,25 @@ def _assert_reboot_slo_payload(payload: dict): assert "-Mode Verify" in collection["no_secret_verify_command"] assert "host_reboot" in collection["forbidden_actions"] assert "windows_password_or_secret_collection" in collection["forbidden_actions"] + windows99_management = payload["windows99_management_channel"] + assert windows99_management["readback_present"] is True + assert windows99_management["host_reachable"] is True + assert windows99_management["remote_execution_channel_ready"] is False + assert windows99_management["ssh_batch"]["status"] == "permission_denied" + assert windows99_management["rdp_console_reachable"] is True stockplatform = payload["stockplatform_data_freshness"] assert stockplatform["freshness_endpoint_readback_present"] is True assert stockplatform["ingestion_endpoint_readback_present"] is True - assert stockplatform["freshness_blockers"] == ["postgres_not_ready"] - assert stockplatform["ingestion_blockers"] == ["postgres_not_ready"] + assert stockplatform["freshness_blockers"] == [] + assert stockplatform["ingestion_blockers"] == [] assert stockplatform["margin_short_recovery"]["status"] == "not_verified" assert stockplatform["margin_short_recovery"]["successful_source_run_ids"] == [] - assert stockplatform["ai_recommendations_recovery"]["status"] == "not_verified" + assert stockplatform["ai_recommendations_recovery"]["status"] == "recovered" assert stockplatform["eod_window"]["final_retry_window_passed"] is False assert stockplatform["controlled_recovery_gate"]["required"] is False assert ( stockplatform["controlled_recovery_gate"]["status"] - == "not_required_yet" + == "not_required_freshness_recovered" ) assert "manual_db_update" in stockplatform["controlled_recovery_gate"][ "forbidden_actions" @@ -299,7 +304,7 @@ def _assert_drill_preflight_payload(payload: dict): assert payload["priority"] == "P0-006" assert payload["status"] == "blocked_reboot_drill_preflight_not_ready" assert payload["preflight_ready"] is False - assert payload["preflight_blocker_count"] == 9 + assert payload["preflight_blocker_count"] == 5 assert payload["active_blockers"] == EXPECTED_DRILL_PREFLIGHT_BLOCKERS assert payload["break_glass_authorization_required"] is True assert payload["execution_authorized_by_this_endpoint"] is False @@ -338,25 +343,25 @@ def _assert_drill_preflight_payload(payload: dict): assert payload["preconditions"] == { "source_controls_present": True, "service_green": False, - "product_data_green": False, + "product_data_green": True, "backup_core_green": False, - "host_188_service_green": False, + "host_188_service_green": True, "all_required_hosts_observed": True, "all_required_hosts_reachable": False, "latest_verify_only_metric_present": False, - "stockplatform_freshness_ok": False, - "stockplatform_ingestion_ok": False, + "stockplatform_freshness_ok": True, + "stockplatform_ingestion_ok": True, "blocked_only_by_fresh_reboot_window": False, } current = payload["current_readback"] assert current["scorecard_status"] == "blocked_reboot_auto_recovery_slo_not_ready" - assert current["readiness_percent"] == 15 - assert current["active_blocker_count"] == 13 + assert current["readiness_percent"] == 43 + assert current["active_blocker_count"] == 11 assert current["active_blockers"] == EXPECTED_REBOOT_SLO_BLOCKERS assert current["latest_verify_only_metric_ready"] == 0 assert current["latest_verify_only_metric_blocker_count"] == 0 - assert current["stockplatform_freshness_status"] == "not_configured" - assert current["stockplatform_ingestion_status"] == "not_configured" + assert current["stockplatform_freshness_status"] == "ok" + assert current["stockplatform_ingestion_status"] == "ok" check_mode = payload["check_mode"] assert check_mode["verify_only_available"] is True assert ( @@ -376,12 +381,12 @@ def _assert_drill_preflight_payload(payload: dict): assert readback["workplan_id"] == "P0-006" assert readback["preflight_ready"] is False assert readback["target_selector"]["required_host_count"] == 7 - assert readback["current_readback"]["active_blocker_count"] == 13 + assert readback["current_readback"]["active_blocker_count"] == 11 assert readback["safe_next_step"] == payload["safe_next_step"] rollups = payload["rollups"] assert rollups["preflight_ready"] is False assert rollups["preflight_ready_count"] == 0 - assert rollups["preflight_blocker_count"] == 9 + assert rollups["preflight_blocker_count"] == 5 assert rollups["target_required_host_count"] == 7 assert rollups["target_observed_host_count"] == 7 assert rollups["target_unreachable_host_count"] == 1 diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 94882dbd..e6b5c3c4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -16,6 +16,23 @@ **仍維持**: - 未執行 Windows / VMware console;未讀密碼、token、secret、`.env`、raw sessions / SQLite / auth;未重啟主機 / VM / Docker / Nginx / K3s / DB / firewall;未觸發 workflow;未使用 GitHub / `gh` / GitHub API。 +## 2026-07-02 — 15:08 P0-006 Windows 99 management channel 進入 reboot SLO + +**完成內容**: +- 新增 `scripts/reboot-recovery/windows99-management-channel-probe.py`:no-secret 讀回 99 的 ping / TCP `22 / 135 / 445 / 3389 / 5985 / 5986`、SSH BatchMode publickey、WinRM port 與 RDP console reachability;不讀 Windows 密碼、不啟動 VM、不重啟、不改 Windows Update。 +- `scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py` 新增 `--windows99-management-file`,當 99 VMware verifier output 缺失且 management channel 不可用時,active blocker 會明確加入 `windows99_remote_execution_channel_unavailable`,next action 固定為 `restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot`。 +- `/api/v1/agents/reboot-auto-recovery-slo-scorecard` 讀回新增 `windows99_management_channel`、`windows99_remote_execution_channel_ready`、`windows99_ssh_batch_status` 與 rollups;`reboot-auto-recovery-slo-exporter.sh` 也會自動產生 management-channel artifact 並餵進 scorecard。 +- 2026-07-02 15:08 live artifact `/tmp/awoooi-reboot-slo-20260702-150844` 已寫回 `docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json`:readiness `43%`、active blockers `11`。StockPlatform freshness / ingestion 已回 `ok`、latest trading date `2026-07-01`;188 service green;仍不可宣稱 10 分鐘自恢復完成,因為 111 unreachable、99 uptime unknown、99 VMware verifier missing、99 remote execution channel unavailable、backup core / Wazuh 仍 blocking。 +- `FULL-STACK-COLD-START-SOP.md` 升 v1.95,`docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md` 同步 P0-RBT-001/002/003/004/008/009 狀態。 + +**驗證**: +- `python3.11 -m py_compile scripts/reboot-recovery/windows99-management-channel-probe.py scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py apps/api/src/services/reboot_auto_recovery_slo_scorecard.py`:通過。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost/test python3.11 -m pytest scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py -q`:`15 passed`。 +- Live 99 management probe:`host_reachable=true`、`rdp_console_reachable=true`、`ssh_batch.status=permission_denied`、`winrm_http_open=false`、`winrm_https_open=false`、`remote_execution_channel_ready=false`。 + +**仍維持**: +- 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未觸發 workflow;未重啟主機 / VM / Docker / Nginx / K3s / DB / firewall;未啟動 VMware VM;未寫 production DB。 + ## 2026-07-02 — 14:46 CD controlled-runtime classifier 補上 Windows 99 VMware verifier source **完成內容**: diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index d480ef5b..d90e47c1 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -2,27 +2,25 @@ "active_blockers": [ "all_required_hosts_not_in_10_minute_reboot_window", "backup_core_green_not_1", - "host_188_service_green_not_1", "host_boot_observation_older_than_target_window", "host_unreachable_after_reboot", "host_uptime_unknown", - "local_disk_free_below_minimum", "post_start_blocked_not_zero", - "product_data_green_not_1", "reboot_event_required_host_unreachable", "service_green_not_1", "wazuh_dashboard_degraded", + "windows99_remote_execution_channel_unavailable", "windows99_vmware_autostart_readback_missing" ], "can_claim_all_services_recovered_within_target": false, "capacity": { "checked": true, - "free_gib": 0.751, + "free_gib": 4.021, "min_free_gib": 2.0 }, "current_phase": "host_boot_detection_blocked", "eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", - "generated_at": "2026-06-30T20:44:37+08:00", + "generated_at": "2026-07-02T15:08:44+08:00", "host_boot_detection": { "host_rows": [ { @@ -38,14 +36,14 @@ }, { "alias": "110", - "boot_id": "node_exporter_1782811060", + "boot_id": "2b07b8f1-d818-48be-a4bc-a05aedcca658", "reachable": true, - "startup_active": "unknown", - "startup_enabled": "unknown", + "startup_active": "inactive", + "startup_enabled": "enabled", "startup_unit": "awoooi-startup-110.service", - "systemd_state": "node_exporter", + "systemd_state": "running", "target": "wooo@192.168.0.110", - "uptime_seconds": 12406 + "uptime_seconds": 86200 }, { "alias": "111", @@ -67,7 +65,7 @@ "startup_unit": "vm-host-boot", "systemd_state": "node_exporter", "target": "192.168.0.112", - "uptime_seconds": 12484 + "uptime_seconds": 165145 }, { "alias": "120", @@ -78,7 +76,7 @@ "startup_unit": "k3s.service", "systemd_state": "running", "target": "wooo@192.168.0.120", - "uptime_seconds": 12479 + "uptime_seconds": 165139 }, { "alias": "121", @@ -89,7 +87,7 @@ "startup_unit": "k3s.service", "systemd_state": "running", "target": "wooo@192.168.0.121", - "uptime_seconds": 12480 + "uptime_seconds": 165140 }, { "alias": "188", @@ -100,10 +98,10 @@ "startup_unit": "awoooi-startup.service", "systemd_state": "degraded", "target": "ollama@192.168.0.188", - "uptime_seconds": 12420 + "uptime_seconds": 165079 } ], - "max_observed_uptime_seconds": 12484, + "max_observed_uptime_seconds": 165145, "missing_hosts": [], "observed_hosts": [ "110", @@ -162,67 +160,70 @@ "rerun_host_pressure_and_cold_start_scorecard_after_apply" ] }, - "next_safe_action": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "next_safe_action": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "post_reboot_readiness": { "backup_core_green": false, - "host_188_service_green": false, - "next_required_gates": "product_data_freshness_recovery,backup_core_readback_recovery,credential_escrow_evidence,host_188_hygiene_maintenance_window", + "host_188_service_green": true, + "next_required_gates": "backup_core_readback_recovery,credential_escrow_evidence,host_188_hygiene_maintenance_window", "overall_declaration": "SERVICE_BLOCKED", - "post_start_blocked": 8, + "post_start_blocked": 1, "post_start_result": "BLOCKED", - "product_data_green": false, + "product_data_green": true, "service_green": false, "summary_present": true, "wazuh_dashboard_degraded": true }, "primary_blocker": "reboot_event_required_host_unreachable", "readback": { - "active_blocker_count": 13, + "active_blocker_count": 11, "active_blockers": [ "all_required_hosts_not_in_10_minute_reboot_window", "backup_core_green_not_1", - "host_188_service_green_not_1", "host_boot_observation_older_than_target_window", "host_unreachable_after_reboot", "host_uptime_unknown", - "local_disk_free_below_minimum", "post_start_blocked_not_zero", - "product_data_green_not_1", "reboot_event_required_host_unreachable", "service_green_not_1", "wazuh_dashboard_degraded", + "windows99_remote_execution_channel_unavailable", "windows99_vmware_autostart_readback_missing" ], "blocked_by_fresh_reboot_window_only": false, "current_phase": "host_boot_detection_blocked", "eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", "host_reboot_authorized_by_this_scorecard": false, - "next_safe_action": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "next_safe_action": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "primary_blocker": "reboot_event_required_host_unreachable", - "readiness_percent": 15, + "readiness_percent": 43, "required_checks": { "backup_core_green": false, "can_claim_slo": false, "fresh_reboot_window_observed": false, - "host_188_service_green": false, - "product_data_green": false, + "host_188_service_green": true, + "product_data_green": true, "required_hosts_observed": true, "required_hosts_reachable": false, "service_green": false, "source_controls_present": true, - "stockplatform_freshness_ok": false, - "stockplatform_ingestion_ok": false, + "stockplatform_freshness_ok": true, + "stockplatform_ingestion_ok": true, + "windows99_management_channel_ready": false, "windows99_vmware_autostart_ready": false, "windows_update_no_auto_reboot_ready": false }, "runtime_write_authorized_by_this_scorecard": false, - "safe_next_step": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "safe_next_step": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "secret_value_collection_allowed": false, "source_controls_present": true, "source_id": "reboot_auto_recovery_slo_scorecard", "status": "blocked_reboot_auto_recovery_slo_not_ready", "target_minutes": 10, + "windows99_remote_execution_channel_ready": false, + "windows99_ssh_batch_status": "permission_denied", "windows99_update_no_auto_reboot_ready": false, + "windows99_verify_collection_can_collect_no_secret": true, + "windows99_verify_collection_status": "blocked_windows99_verify_output_missing_host_reachable", "windows99_vmware_verify_ready": false, "workflow_trigger_authorized_by_this_scorecard": false, "workplan_id": "P0-006", @@ -241,27 +242,25 @@ "rebooted_hosts": [ "99" ], - "state_file": "/tmp/awoooi-reboot-slo-live-20260630-2045/reboot-event-state.json", - "state_written": false, + "state_file": "/tmp/awoooi-reboot-slo-20260702-150844/reboot-event-state.json", + "state_written": true, "unreachable_hosts": [ "111" ] }, "reboot_sop_progress": { - "active_blocker_count": 13, + "active_blocker_count": 11, "active_blockers": [ "all_required_hosts_not_in_10_minute_reboot_window", "backup_core_green_not_1", - "host_188_service_green_not_1", "host_boot_observation_older_than_target_window", "host_unreachable_after_reboot", "host_uptime_unknown", - "local_disk_free_below_minimum", "post_start_blocked_not_zero", - "product_data_green_not_1", "reboot_event_required_host_unreachable", "service_green_not_1", "wazuh_dashboard_degraded", + "windows99_remote_execution_channel_unavailable", "windows99_vmware_autostart_readback_missing" ], "current_phase": "host_boot_detection_blocked", @@ -275,36 +274,37 @@ "backup_health_and_offsite_evidence", "telegram_alert_delivery_readback" ], - "next_safe_action": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "next_safe_action": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "primary_blocker": "reboot_event_required_host_unreachable", - "readiness_percent": 15 + "readiness_percent": 43 }, "required_checks": { "backup_core_green": false, "can_claim_slo": false, "fresh_reboot_window_observed": false, - "host_188_service_green": false, - "product_data_green": false, + "host_188_service_green": true, + "product_data_green": true, "required_hosts_observed": true, "required_hosts_reachable": false, "service_green": false, "source_controls_present": true, - "stockplatform_freshness_ok": false, - "stockplatform_ingestion_ok": false, + "stockplatform_freshness_ok": true, + "stockplatform_ingestion_ok": true, + "windows99_management_channel_ready": false, "windows99_vmware_autostart_ready": false, "windows_update_no_auto_reboot_ready": false }, "rollups": { - "active_blocker_count": 13, + "active_blocker_count": 11, "backup_core_green": false, "blocked_by_fresh_reboot_window_only": false, "can_claim_all_services_recovered_within_target": false, - "capacity_below_minimum": true, + "capacity_below_minimum": false, "capacity_checked": true, - "capacity_free_gib": 0.751, + "capacity_free_gib": 4.021, "capacity_min_free_gib": 2.0, - "completed_check_count": 2, - "host_188_service_green": false, + "completed_check_count": 6, + "host_188_service_green": true, "latest_verify_only_metric_blocker_count": 0, "latest_verify_only_metric_last_run_timestamp": 0, "latest_verify_only_metric_max_host_uptime_seconds": 0, @@ -312,31 +312,45 @@ "latest_verify_only_metric_ready": 0, "missing_host_count": 0, "observed_host_count": 7, - "post_start_blocked": 8, - "product_data_green": false, - "readiness_percent": 15, - "required_check_count": 13, + "post_start_blocked": 1, + "product_data_green": true, + "readiness_percent": 43, + "required_check_count": 14, "service_green": false, - "source_control_count": 15, - "source_control_ready_count": 15, + "source_control_count": 16, + "source_control_ready_count": 16, "source_controls_present": true, "stale_host_count": 5, - "stockplatform_freshness_blocker_count": 1, - "stockplatform_freshness_status": "not_configured", - "stockplatform_ingestion_blocker_count": 1, - "stockplatform_ingestion_status": "not_configured", + "stockplatform_freshness_blocker_count": 0, + "stockplatform_freshness_status": "ok", + "stockplatform_ingestion_blocker_count": 0, + "stockplatform_ingestion_status": "ok", "unknown_uptime_host_count": 1, "unreachable_host_count": 1, + "windows99_can_collect_vmware_verify_without_secret": false, + "windows99_host99_reachable": true, + "windows99_host99_uptime_known": false, + "windows99_host_reachable": true, + "windows99_management_readback_present": true, + "windows99_rdp_console_reachable": true, + "windows99_remote_execution_channel_ready": false, + "windows99_ssh_batch_ready": false, + "windows99_ssh_batch_status": "permission_denied", "windows99_update_no_auto_reboot_ready": false, + "windows99_verify_collection_blocker_count": 2, + "windows99_verify_collection_can_collect_no_secret": true, + "windows99_verify_collection_status": "blocked_windows99_verify_output_missing_host_reachable", "windows99_vmrun_present": false, "windows99_vmware_config_ready": false, "windows99_vmware_missing_vmx_count": 5, "windows99_vmware_power_ready": false, "windows99_vmware_powered_off_count": 5, "windows99_vmware_readback_present": false, - "windows99_vmware_verify_ready": false + "windows99_vmware_verify_ready": false, + "windows99_winrm_http_open": false, + "windows99_winrm_https_open": false }, - "safe_next_step": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "safe_next_step": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1", "sla_recovery_eta": { "deadline_status": "target_window_elapsed", @@ -369,23 +383,16 @@ "slo_systemd_service_source_present": true, "slo_systemd_timer_source_present": true, "telegram_reboot_backup_alert_rules_source_present": true, + "windows99_management_channel_probe_source_present": true, "windows_99_vmware_autostart_source_present": true }, "status": "blocked_reboot_auto_recovery_slo_not_ready", "stockplatform_data_freshness": { "ai_recommendations_recovery": { "cleared_blocker": "ai_recommendations_stale", - "status": "not_verified" + "status": "recovered" }, - "blocked_sources": [ - { - "latest_date": null, - "notes": "PostgreSQL 結構尚未就緒:OperationalError", - "row_count": 0, - "source": "postgres", - "status": "not_configured" - } - ], + "blocked_sources": [], "controlled_recovery_gate": { "allowed_actions": [ "inspect_existing_ingestion_readback", @@ -401,29 +408,106 @@ "reboot_or_service_restart_from_reboot_slo_lane" ], "required": false, - "status": "not_required_yet", + "status": "not_required_freshness_recovered", "target_selector": "stockplatform-v2:system_freshness:core.margin_short_daily,ai.recommendations" }, "eod_window": { - "classification": "unknown", + "classification": "ok", "final_retry_window_end_local": "unknown", "final_retry_window_passed": false, "first_full_window_end_local": "unknown", - "next_action": "unknown", + "next_action": "none", "pending": false }, - "freshness_blockers": [ - "postgres_not_ready" - ], + "freshness_blockers": [], "freshness_endpoint_readback_present": true, - "freshness_status": "not_configured", - "ingestion_blockers": [ - "postgres_not_ready" - ], + "freshness_status": "ok", + "ingestion_blockers": [], "ingestion_endpoint_readback_present": true, - "ingestion_status": "not_configured", - "latest_source_runs": [], - "latest_trading_date": "None", + "ingestion_status": "ok", + "latest_source_runs": [ + { + "finished_at": "2026-07-02T07:00:09.686543Z", + "source_name": "intelligence_security_linker", + "source_run_id": 3508, + "started_at": "2026-07-02T07:00:09.686543Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T07:00:07.375654Z", + "source_name": "intelligence_reports_import", + "source_run_id": 3507, + "started_at": "2026-07-02T07:00:07.375654Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T05:00:09.044775Z", + "source_name": "intelligence_security_linker", + "source_run_id": 3506, + "started_at": "2026-07-02T05:00:09.044775Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T05:00:07.871117Z", + "source_name": "intelligence_reports_import", + "source_run_id": 3505, + "started_at": "2026-07-02T05:00:07.871117Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T03:00:09.959576Z", + "source_name": "intelligence_security_linker", + "source_run_id": 3504, + "started_at": "2026-07-02T03:00:09.959576Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T03:00:07.617332Z", + "source_name": "intelligence_reports_import", + "source_run_id": 3503, + "started_at": "2026-07-02T03:00:07.617332Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T01:00:13.407344Z", + "source_name": "intelligence_security_linker", + "source_run_id": 3502, + "started_at": "2026-07-02T01:00:13.407344Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-02T01:00:11.723462Z", + "source_name": "intelligence_reports_import", + "source_run_id": 3501, + "started_at": "2026-07-02T01:00:11.723462Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-01T23:00:10.305951Z", + "source_name": "intelligence_security_linker", + "source_run_id": 3500, + "started_at": "2026-07-01T23:00:10.305951Z", + "status": "succeeded", + "target_date": null + }, + { + "finished_at": "2026-07-01T23:00:09.148428Z", + "source_name": "intelligence_reports_import", + "source_run_id": 3499, + "started_at": "2026-07-01T23:00:09.148428Z", + "status": "succeeded", + "target_date": null + } + ], + "latest_trading_date": "2026-07-01", "margin_short_recovery": { "cleared_blocker": "core_margin_short_daily_missing", "cleared_ingestion_blocker": "core.margin_short_daily_incomplete", @@ -433,25 +517,27 @@ }, "summary": { "github_api_used": false, - "reboot_auto_recovery_active_blocker_count": 13, + "reboot_auto_recovery_active_blocker_count": 11, "reboot_auto_recovery_backup_core_green": false, "reboot_auto_recovery_can_claim_slo": false, "reboot_auto_recovery_current_phase": "host_boot_detection_blocked", "reboot_auto_recovery_eta_or_wait_reason": "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_event_and_probe", - "reboot_auto_recovery_host_188_service_green": false, - "reboot_auto_recovery_next_safe_action": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "reboot_auto_recovery_host_188_service_green": true, + "reboot_auto_recovery_next_safe_action": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "reboot_auto_recovery_observed_host_count": 7, "reboot_auto_recovery_primary_blocker": "reboot_event_required_host_unreachable", - "reboot_auto_recovery_product_data_green": false, - "reboot_auto_recovery_readiness_percent": 15, - "reboot_auto_recovery_safe_next_step": "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot", + "reboot_auto_recovery_product_data_green": true, + "reboot_auto_recovery_readiness_percent": 43, + "reboot_auto_recovery_safe_next_step": "restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot", "reboot_auto_recovery_service_green": false, "reboot_auto_recovery_source_controls_present": true, "reboot_auto_recovery_stale_host_count": 5, "reboot_auto_recovery_status": "blocked_reboot_auto_recovery_slo_not_ready", - "reboot_auto_recovery_stockplatform_freshness_status": "not_configured", - "reboot_auto_recovery_stockplatform_ingestion_status": "not_configured", + "reboot_auto_recovery_stockplatform_freshness_status": "ok", + "reboot_auto_recovery_stockplatform_ingestion_status": "ok", "reboot_auto_recovery_windows99_update_no_auto_reboot_ready": false, + "reboot_auto_recovery_windows99_verify_collection_can_collect_no_secret": true, + "reboot_auto_recovery_windows99_verify_collection_status": "blocked_windows99_verify_output_missing_host_reachable", "reboot_auto_recovery_windows99_vmware_verify_ready": false, "reboot_auto_recovery_workplan_id": "P0-006", "runtime_write_authorized": false, @@ -460,6 +546,95 @@ }, "target_minutes": 10, "target_seconds": 600, + "windows99_management_channel": { + "blockers": [ + "windows99_remote_execution_channel_unavailable", + "windows99_winrm_unavailable", + "windows99_ssh_batch_denied" + ], + "can_collect_vmware_verify_without_secret": false, + "forbidden_actions": [ + "read_windows_password", + "read_secret_value", + "start_vm", + "reboot_host", + "restart_service", + "write_windows_policy" + ], + "generated_at": "2026-07-02T15:08:44+08:00", + "host": "192.168.0.99", + "host_reachable": true, + "rdp_console_reachable": true, + "readback_present": true, + "remote_execution_channel_ready": false, + "schema_version": "windows99_management_channel_readback_v1", + "ssh_batch": { + "checked": true, + "ready": false, + "status": "permission_denied" + }, + "ssh_user": "administrator", + "tcp_ports": { + "135": "open", + "22": "open", + "3389": "open", + "445": "open", + "5985": "timeout", + "5986": "timeout" + }, + "winrm_http_open": false, + "winrm_https_open": false + }, + "windows99_verify_collection": { + "can_collect_no_secret_verify": true, + "collection_blockers": [ + "windows99_vmware_autostart_readback_missing", + "windows99_uptime_unknown" + ], + "expected_no_secret_output_fields": [ + "VMRUN_PRESENT", + "VMX alias= present=<0|1>", + "VMWARE_SERVICE name= ok=<0|1>", + "VMWARE_AUTOSTART_TASK name=AWOOOI-Start-VMware-VMs ok=<0|1>", + "WINDOWS_UPDATE_POLICY name= ok=<0|1>", + "VM_POWER alias= running=<0|1>", + "VMWARE_AUTOSTART_CONFIG_READY", + "VMWARE_AUTOSTART_POWER_READY", + "WINDOWS_UPDATE_NO_AUTO_REBOOT_READY", + "VMWARE_AUTOSTART_VERIFY_READY" + ], + "forbidden_actions": [ + "windows_password_or_secret_collection", + "host_reboot", + "vm_power_change", + "windows_update_policy_apply", + "manual_registry_edit", + "service_restart", + "github_api" + ], + "host99_reachable": true, + "host99_uptime_known": false, + "no_secret_verify_command": "powershell -ExecutionPolicy Bypass -File .\\windows99-vmware-autostart.ps1 -Mode Verify", + "post_verifier": "rerun_reboot_auto_recovery_slo_scorecard_with_windows99_vmware_file_no_secret_no_reboot", + "readback_present": false, + "required_vm_aliases": [ + "111", + "112", + "120", + "121", + "188" + ], + "safe_collection_channels": [ + "authorized_windows99_console_verify_stdout_only", + "existing_management_channel_verify_mode_only", + "committed_no_secret_artifact_file_then_scorecard_rerun" + ], + "schema_version": "windows99_vmware_verify_collection_packet_v1", + "status": "blocked_windows99_verify_output_missing_host_reachable", + "target_host": "192.168.0.99", + "target_host_alias": "99", + "verify_ready": false + }, "windows99_vmware_autostart": { "blockers": [ "windows99_vmware_autostart_readback_missing" diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 361d2f21..526d7dd9 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.94 +> Version: v1.95 > Last updated: 2026-07-02 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -22,6 +22,8 @@ v1.93 reboot SLA readback fixed fields rule:`scripts/reboot-recovery/reboot-au v1.94 Windows 99 / VMware verifier rule:99 主機與 guest VM 自動啟動不得再只靠「ping 到 99」或「VMX source 存在」宣稱。`scripts/reboot-recovery/windows99-vmware-autostart.ps1 -Mode Verify` 必須輸出 no-secret 固定欄位:`VMRUN_PRESENT`、`VMX alias=... present=...`、`VMWARE_SERVICE ... ok=...`、`VMWARE_AUTOSTART_TASK ... ok=...`、`WINDOWS_UPDATE_POLICY ... ok=...`、`VM_POWER alias=... running=...`、`VMWARE_AUTOSTART_CONFIG_READY`、`VMWARE_AUTOSTART_POWER_READY`、`WINDOWS_UPDATE_NO_AUTO_REBOOT_READY`、`VMWARE_AUTOSTART_VERIFY_READY`。`reboot-auto-recovery-slo-scorecard.py --windows99-vmware-file ` 必須解析這些欄位;缺 readback 時 active blocker 固定為 `windows99_vmware_autostart_readback_missing`,next action 固定為 `collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_reboot_scorecard_no_secret_no_reboot`。預設 required guest VM alias 為 `111 / 188 / 120 / 121 / 112`;`110` 不再被 99 VMware autostart 預設清單替代。此 verifier 不讀 Windows 密碼、不讀 secret、不啟動 VM、不重啟 host;`Apply` 仍需獨立 controlled apply 與 post-verifier。 +v1.95 Windows 99 management-channel readback rule:若 99 可 ping / RDP / TCP,但 `windows99-vmware-autostart.ps1 -Mode Verify` 尚未收回,不得只寫成「等 verifier」。必須先跑 `scripts/reboot-recovery/windows99-management-channel-probe.py --output `,並把結果用 `reboot-auto-recovery-slo-scorecard.py --windows99-management-file ` 接入同一份 SLO scorecard。此 probe 只做 no-secret readback:TCP `22 / 135 / 445 / 3389 / 5985 / 5986`、SSH BatchMode publickey、WinRM port、RDP console reachability;不得讀 Windows 密碼、不得啟動 VM、不得重啟、不得改 Windows Update。2026-07-02 15:08 live readback:`host_reachable=true`、`rdp_console_reachable=true`、`ssh_batch.status=permission_denied`、`winrm_http_open=false`、`winrm_https_open=false`、`remote_execution_channel_ready=false`;因此 active blockers 必須包含 `windows99_remote_execution_channel_unavailable` 與 `windows99_vmware_autostart_readback_missing`,next action 固定為 `restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot`。 + 2026-07-02 110 control-path / Harbor recovery receipt rule:若 Gitea Harbor repair queue 仍保留 `harbor_110_remote_ssh_publickey_auth_stalled`、remote-control unavailable、jobs stale 或 historical failure,但同一輪本地證據同時證明 `wooo` command path ready、110 local Harbor `/v2/` ready、public/internal registry `/v2/` 回 `401`,則該 Gitea Harbor repair 失敗只能列為 historical queue metadata,不得再當成 current SSH blocker。必須用 `/api/v1/agents/harbor-registry-controlled-recovery-receipt` 或同等 validator 合併 `diagnose-110-ssh-publickey-auth.sh`、`recover-110-control-path-and-harbor-local.sh --check`、public Gitea queue readback 與 registry `/v2/` verifier,並把機器可讀結果寫入 `docs/operations/harbor-110-control-path-recovery-readback-2026-07-02.snapshot.json` 類型的 snapshot。2026-07-02 live receipt 顯示:public/internal registry `/v2/` 均為 `401`、latest visible CD `#4335` 為 `Success`、Gitea Harbor repair failure 已是 `historical_after_latest_cd_success=true`;active blockers 收斂為 110 controlled CD lane config / binary / registration / service guardrail、active action container pressure,以及 Gitea CD jobs head-SHA / stale readback mismatch。若 local-console output 只有 `AWOOOI_110_CONTROLLED_CD_LANE_READY` marker,non110 runner parser 不得從 110 `BLOCKER` 行推導 non110 blocker;non110 只有看到 `AWOOOI_NON110_RUNNER_READY` marker 才能列入 active blocker。 2026-07-02 110 controlled CD lane fail-closed enforcer staging rule:110 runner 壓力事故後,legacy / generic runner 仍必須 fail-closed;但 `awoooi-cd-lane-drain.service` 的非 secret staging artifact 不得再被 enforcer 無差別封回 stub。`scripts/reboot-recovery/enforce-110-runner-failclosed.sh` 只有在 `config.yaml` 符合 `capacity <= 1`、只含 `awoooi-host:host` 與 `awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04`、binary 是 executable ELF、systemd unit 具備 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`、`CPUAccounting` / `MemoryAccounting` / `TasksAccounting` / `NoNewPrivileges` 等 guardrail,且 service `inactive`、`MainPID=0`、未 enabled / 未 masked 時,才可保留 drain config / binary / unit,並輸出 `CONTROLLED_DRAIN_STAGING_ALLOWED=1` 與 textfile metric。此 staging 規則不得讀 token、不得讀 `.runner` 內容、不得註冊 runner、不得啟動 service;若 registration 缺失,readiness verifier 仍必須只留下 `controlled_cd_lane_registration_missing` / `controlled_cd_lane_service_not_active` 類 blocker。若 `CONTROLLED_DRAIN_STAGING_ALLOWED=0` 且 config / binary 又被搬走,優先修 source enforcer / unit guardrail,不要手工反覆補同一組 artifact。 diff --git a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md index 7bdf573f..55c6e54a 100644 --- a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md +++ b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md @@ -57,15 +57,15 @@ | 順序 | ID | 優先序 | 使用者插入要求 | 正規化工作項 | 目前狀態 | 下一個可驗證動作 | | --- | --- | --- | --- | --- | --- | --- | -| 1 | CIR-P0-RBT-001 | P0 | 「主機重啟後 10 分鐘內全部恢復,且要自動判斷所有主機被重啟」 | 建立 99/110/111/112/120/121/188 reboot event detector + 10 分鐘 SLO scorecard + fixed triage order | 部分已有 `reboot_recovery_slo_alerts`、scorecard、textfile;仍需要 fresh all-host reboot/drill 證明 | 產生最新 reboot SLO scorecard readback;若缺 fresh event,標 `awaiting_next_reboot_or_approved_drill`,不可宣稱 10 分鐘 SLA 已證明 | -| 2 | CIR-P0-RBT-002 | P0 | 「沒有偵測到主機重啟」 | 修正 host reboot/shutdown/up detection:boot_id / uptime / node exporter / Windows exporter / VMware VM power state 都要進同一事件 | Source verifier 已補:scorecard 可解析 Windows99 VMware readback;`windows99_verify_collection` 已補 no-secret collection packet;live 99 verify output 仍缺 | 收集 `windows99-vmware-autostart.ps1 -Mode Verify` no-secret output 後 rerun scorecard;缺 99 時不得把 110/120/121/188 green 當全主機 green | -| 3 | CIR-P0-RBT-003 | P0 | 「192.168.0.99 VMWare 要自動啟動,裡面 111/188/120/121/112 也自動啟動」 | Windows 99 VMware host autostart + guest VM autostart contract;VM host 111/188/120/121/112 開機順序與 readback | Source verifier / parser / API readback / collection packet 已完成;snapshot active blocker=`windows99_vmware_autostart_readback_missing` | 從 99 取得 no-secret Verify output,確認 `VMRUN_PRESENT`、scheduled task、VMware services、VM power、VMX present 全綠 | -| 4 | CIR-P0-RBT-004 | P0 | 「192.168.0.99 不可因 Windows Update 無預警重開」 | Windows Update reboot policy:active hours / no auto-restart / maintenance window / update notification audit | Source verifier 已補 `WINDOWS_UPDATE_POLICY` 與 `WINDOWS_UPDATE_NO_AUTO_REBOOT_READY`;collection packet 已列 forbidden actions;live 99 policy readback 仍缺 | 從 99 取得 Verify output;若 policy 不綠,再走 controlled apply,禁止要求或記錄 Windows 密碼 | +| 1 | CIR-P0-RBT-001 | P0 | 「主機重啟後 10 分鐘內全部恢復,且要自動判斷所有主機被重啟」 | 建立 99/110/111/112/120/121/188 reboot event detector + 10 分鐘 SLO scorecard + fixed triage order | 2026-07-02 15:08 live scorecard 已更新:readiness `43%`、active blockers `11`;`windows99_verify_collection` 與 `windows99_management_channel` 已進 API / scorecard;仍缺 fresh all-host 10 分鐘證明,111 不可達,99 uptime / VMware verifier 未閉環 | 優先收斂 99 no-secret management channel / verifier readback 與 111 reachability;不可宣稱 10 分鐘 SLA 已證明 | +| 2 | CIR-P0-RBT-002 | P0 | 「沒有偵測到主機重啟」 | 修正 host reboot/shutdown/up detection:boot_id / uptime / node exporter / Windows exporter / VMware VM power state 都要進同一事件 | Scorecard 已接 collection packet + management probe;99 host reachable 但 uptime unknown,111 unreachable,stale hosts 仍存在 | 讓 99 verifier / Windows exporter 或等效 no-secret readback 進入 host boot event,並補 111 reachability 證據 | +| 3 | CIR-P0-RBT-003 | P0 | 「192.168.0.99 VMWare 要自動啟動,裡面 111/188/120/121/112 也自動啟動」 | Windows 99 VMware host autostart + guest VM autostart contract;VM host 111/188/120/121/112 開機順序與 readback | Source verifier / parser / API readback / collection packet 已完成;management probe 讀回 `host_reachable=true`、RDP open、SSH BatchMode `permission_denied`、WinRM timeout;snapshot active blockers=`windows99_remote_execution_channel_unavailable`、`windows99_vmware_autostart_readback_missing` | 恢復 no-secret management channel 或收集 local console Verify output,再確認 `VMRUN_PRESENT`、scheduled task、VMware services、VM power、VMX present 全綠 | +| 4 | CIR-P0-RBT-004 | P0 | 「192.168.0.99 不可因 Windows Update 無預警重開」 | Windows Update reboot policy:active hours / no auto-restart / maintenance window / update notification audit | Source verifier 已補 `WINDOWS_UPDATE_POLICY` 與 `WINDOWS_UPDATE_NO_AUTO_REBOOT_READY`;collection packet 已列 forbidden actions;99 management channel 尚不能收 policy readback | 取得 Verify output;若 policy 不綠,再走 controlled apply,禁止要求或記錄 Windows 密碼 | | 5 | CIR-P0-RBT-005 | P0 | 「網站重啟後 502 嚴重影響體驗,要維護頁,外部雲端或專業做法」 | Public maintenance fallback:Nginx / edge / external static maintenance page / status page / fail-open UX,避免 502 直出 | 尚未完整落地;目前是需求缺口 | 產生 `public_maintenance_fallback` decision record:DNS/edge/外部雲端/本地 Nginx fallback 風險比較,先做不切流量的 check-mode | | 6 | CIR-P0-RBT-006 | P0 | 「所有主機關機立刻 Telegram 告警,重啟後也要告警,其他告警一併完整思考」 | Down / shutdown suspected / reboot detected / reboot recovered / SLO missed / backup failed / freshness stale / CPU pressure / Gitea queue 告警矩陣 | 部分已有 Alertmanager rule 與 Telegram receipt 補強;仍缺完整 shutdown/up E2E receipt | 建立 Telegram alert matrix + receipt verifier,逐項讀回 Alertmanager active/resolved 與 outbound receipt,不送測試 secret | | 7 | CIR-P0-RBT-007 | P0 | 「所有備份包含主機、DB、網站、服務、套件、工具、日誌都沒有監控告警」 | Backup observability coverage:backup job inventory、last success、freshness、offsite、restore drill、Telegram receipt | 部分已有 backup health exporter / alert rules;全域 coverage 與 restore drill 未全綠 | 建立 backup coverage matrix:host / DB / website / service config / package list / tool scripts / logs,每列有 metric、alert、last_success、restore_verifier | -| 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command | 已補 scorecard source / snapshot / API 固定欄位:`current_phase`、`eta_or_wait_reason`、`primary_blocker`、`next_safe_action`;仍需 production deploy readback 與下一次 fresh reboot/drill 證明 | 推送後讀回 `/api/v1/agents/reboot-auto-recovery-slo-scorecard`,確認固定欄位在 production 可見;再接 99/VMware 與 fresh reboot/drill | -| 9 | CIR-P0-RBT-009 | P0 | 「所有產品、網站都要是最新版本;版本和數據是否最新要驗證」 | Product freshness/version matrix:source commit、deploy marker、runtime image、public health、data freshness、latest source availability | AWOOOI / StockPlatform 部分已在做;全產品未統一 | 建立全產品 readback 表:product、canonical repo、main SHA、deploy marker、public URL、data freshness、blocked reason | +| 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command | Scorecard/API contract 已固定 `next_safe_action=restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot`;尚待 deploy readback | 推送後讀回 `/api/v1/agents/reboot-auto-recovery-slo-scorecard`,確認 production 顯示 99 management-channel blocker 與 readiness `43%` | +| 9 | CIR-P0-RBT-009 | P0 | 「所有產品、網站都要是最新版本;版本和數據是否最新要驗證」 | Product freshness/version matrix:source commit、deploy marker、runtime image、public health、data freshness、latest source availability | 本輪 StockPlatform public freshness / ingestion 讀回 `ok`,latest trading date `2026-07-01`;全產品版本矩陣仍未統一 | 建立全產品 readback 表:product、canonical repo、main SHA、deploy marker、public URL、data freshness、blocked reason | | 10 | CIR-P0-GIT-001 | P0 | 「Gitea 儲存庫都不見了?Gitea 沒完整備份嗎?」 | Gitea repository identity + backup proof + restore drill:不能只看 UI visible,要比對 SSH heads、repo path、bundle backup、restore sample | 已有 9 expected repos OK / backup health missing=0 的 handoff;仍需 restore drill 證明 | 補 Gitea repo bundle backup readback + sample restore dry-run verifier;禁止刪 repo / 改 visibility | | 11 | CIR-P0-CPU-001 | P0 | 「110 / 188 CPU 負載持續過高,為什麼沒監控告警、沒主動修復」 | Sustained CPU pressure automation:Alertmanager → controller → evidence → service playbook → verifier → KM writeback | 110 已有 `Host110SustainedModeratePressure`、Gitea playbook、Stock/Postgres evidence;188 仍需同級 controller/alerts readback | 下一步接 `postgres_hot_query_or_backup_export_playbook`;並補 188 equivalent readback,不以單次下降結案 | | 12 | CIR-P0-CPU-002 | P0 | 「噪音會影響真問題,要整合一起做」 | Alert noise / real issue correlation:backup aggregate noise、CPU pressure、Gitea queue、Stock freshness 要分清主因與次因 | 部分已在 SOP 註記;仍需統一 correlation scorecard | 建立 incident correlation readback:primary_blocker、secondary_noise、ignored_noise_reason、evidence_ref | diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index 139aea72..06c97965 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -741,6 +741,7 @@ def test_reboot_auto_recovery_slo_sources_stay_on_controlled_runtime_profile() - "scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh)", "scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py)", "scripts/reboot-recovery/windows99-vmware-autostart.ps1)", + "scripts/reboot-recovery/windows99-management-channel-probe.py)", "scripts/reboot-recovery/full-stack-cold-start-check.sh)", "scripts/reboot-recovery/full-stack-recovery-scorecard.sh)", "scripts/reboot-recovery/harbor-watchdog.sh)", @@ -753,6 +754,7 @@ def test_reboot_auto_recovery_slo_sources_stay_on_controlled_runtime_profile() - "scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py)", "scripts/reboot-recovery/tests/test_harbor_watchdog_contract.py)", "../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py", + "../../scripts/reboot-recovery/windows99-management-channel-probe.py", "../../scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py", "../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py", "../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py", @@ -804,6 +806,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N "scripts/reboot-recovery/awoooi-startup-110.sh)", "scripts/reboot-recovery/harbor-watchdog.sh)", "scripts/reboot-recovery/windows99-vmware-autostart.ps1)", + "scripts/reboot-recovery/windows99-management-channel-probe.py)", "scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)", "scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)", "scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)", diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh index 8e423e77..e5c9923b 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh @@ -35,6 +35,7 @@ summary_file="$artifact_dir/summary.txt" scorecard_file="$artifact_dir/scorecard.json" stock_freshness_file="$artifact_dir/stock-freshness.json" stock_ingestion_file="$artifact_dir/stock-ingestion.json" +windows99_management_file="$artifact_dir/windows99-management-channel.json" reboot_event_state_file="${REBOOT_EVENT_STATE_FILE:-${LOG_DIR}/reboot-event-state.json}" bash "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" >"$host_probe" 2>&1 || true @@ -56,6 +57,12 @@ if command -v curl >/dev/null 2>&1; then || rm -f "$stock_ingestion_file" fi +if [ -f "$ROOT_DIR/scripts/reboot-recovery/windows99-management-channel-probe.py" ]; then + python3 "$ROOT_DIR/scripts/reboot-recovery/windows99-management-channel-probe.py" \ + --output "$windows99_management_file" >"$artifact_dir/windows99-management-channel.stdout" 2>"$artifact_dir/windows99-management-channel.err" \ + || rm -f "$windows99_management_file" +fi + scorecard_args=( "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py" --summary-file "$summary_file" \ @@ -72,6 +79,9 @@ fi if [ -s "$stock_ingestion_file" ]; then scorecard_args+=(--stock-ingestion-file "$stock_ingestion_file") fi +if [ -s "$windows99_management_file" ]; then + scorecard_args+=(--windows99-management-file "$windows99_management_file") +fi python3 "${scorecard_args[@]}" || true diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index fe58253b..896cc640 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -49,6 +49,11 @@ def parse_args() -> argparse.Namespace: type=Path, help="Optional windows99-vmware-autostart.ps1 Verify output.", ) + parser.add_argument( + "--windows99-management-file", + type=Path, + help="Optional no-secret Windows 99 management-channel JSON readback.", + ) parser.add_argument("--generated-at", help="Override generated_at for stable snapshots.") parser.add_argument( "--required-host", @@ -259,6 +264,67 @@ def parse_windows99_vmware_readback(text: str) -> dict[str, Any]: } +def parse_windows99_management_readback(path: Path | None) -> dict[str, Any]: + """Parse no-secret Windows 99 management-channel readback JSON.""" + default = { + "readback_present": False, + "host": "192.168.0.99", + "host_reachable": False, + "tcp_ports": {}, + "ssh_batch": {"checked": False, "ready": False, "status": "missing"}, + "winrm_http_open": False, + "winrm_https_open": False, + "rdp_console_reachable": False, + "remote_execution_channel_ready": False, + "can_collect_vmware_verify_without_secret": False, + "blockers": [], + } + if not path: + return default + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError): + blocked = dict(default) + blocked["blockers"] = ["windows99_management_channel_readback_invalid"] + return blocked + if not isinstance(payload, dict): + blocked = dict(default) + blocked["blockers"] = ["windows99_management_channel_readback_invalid"] + return blocked + + tcp_ports = payload.get("tcp_ports") + if not isinstance(tcp_ports, dict): + tcp_ports = {} + ssh_batch = payload.get("ssh_batch") + if not isinstance(ssh_batch, dict): + ssh_batch = {"checked": False, "ready": False, "status": "missing"} + return { + "readback_present": True, + "schema_version": str(payload.get("schema_version") or "unknown"), + "generated_at": str(payload.get("generated_at") or ""), + "host": str(payload.get("host") or "192.168.0.99"), + "host_reachable": payload.get("host_reachable") is True, + "tcp_ports": {str(key): str(value) for key, value in tcp_ports.items()}, + "ssh_user": str(payload.get("ssh_user") or ""), + "ssh_batch": { + "checked": ssh_batch.get("checked") is True, + "ready": ssh_batch.get("ready") is True, + "status": str(ssh_batch.get("status") or "unknown"), + }, + "winrm_http_open": payload.get("winrm_http_open") is True, + "winrm_https_open": payload.get("winrm_https_open") is True, + "rdp_console_reachable": payload.get("rdp_console_reachable") is True, + "remote_execution_channel_ready": ( + payload.get("remote_execution_channel_ready") is True + ), + "can_collect_vmware_verify_without_secret": ( + payload.get("can_collect_vmware_verify_without_secret") is True + ), + "blockers": strings(payload.get("blockers")), + "forbidden_actions": strings(payload.get("forbidden_actions")), + } + + def read_json_object(path: Path | None) -> dict[str, Any]: if not path: return {} @@ -362,6 +428,11 @@ def source_controls() -> dict[str, bool]: "WINDOWS_UPDATE_POLICY", "VM_POWER", ), + "windows99_management_channel_probe_source_present": file_contains( + source_file("scripts/reboot-recovery/windows99-management-channel-probe.py"), + "windows99_management_channel_readback_v1", + "remote_execution_channel_ready", + ), "public_maintenance_fallback_source_present": source_file( "ops/maintenance/maintenance.html" ).exists() @@ -836,6 +907,11 @@ def choose_safe_next_step( "restore_docker_stats_textfile_exporter_then_collect_sanitized_host_" "pressure_no_restart_no_secret_read" ) + if "windows99_remote_execution_channel_unavailable" in blockers: + return ( + "restore_windows99_no_secret_management_channel_or_collect_local_console_" + "verify_readback_then_rerun_reboot_scorecard_no_reboot" + ) if any(blocker.startswith("windows99_") for blocker in blockers): return ( "collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_" @@ -873,6 +949,12 @@ def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]: windows99 = payload.get("windows99_vmware_autostart") if not isinstance(windows99, dict): windows99 = {} + windows99_management = payload.get("windows99_management_channel") + if not isinstance(windows99_management, dict): + windows99_management = {} + ssh_batch = windows99_management.get("ssh_batch") + if not isinstance(ssh_batch, dict): + ssh_batch = {} return { "source_controls_present": source_controls_present, @@ -886,6 +968,10 @@ def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]: "windows_update_no_auto_reboot_ready": ( windows99.get("windows_update_no_auto_reboot_ready") is True ), + "windows99_management_channel_ready": ( + windows99.get("verify_ready") is True + or windows99_management.get("remote_execution_channel_ready") is True + ), "service_green": post_reboot_readiness.get("service_green") is True, "product_data_green": post_reboot_readiness.get("product_data_green") is True, "backup_core_green": post_reboot_readiness.get("backup_core_green") is True, @@ -1059,6 +1145,12 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: windows99 = payload.get("windows99_vmware_autostart") if not isinstance(windows99, dict): windows99 = {} + windows99_management = payload.get("windows99_management_channel") + if not isinstance(windows99_management, dict): + windows99_management = {} + ssh_batch = windows99_management.get("ssh_batch") + if not isinstance(ssh_batch, dict): + ssh_batch = {} capacity = payload.get("capacity") if not isinstance(capacity, dict): capacity = {} @@ -1171,6 +1263,27 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: "windows99_host99_uptime_known": ( windows99_verify_collection["host99_uptime_known"] is True ), + "windows99_management_readback_present": ( + windows99_management.get("readback_present") is True + ), + "windows99_host_reachable": windows99_management.get("host_reachable") is True, + "windows99_remote_execution_channel_ready": ( + windows99_management.get("remote_execution_channel_ready") is True + ), + "windows99_can_collect_vmware_verify_without_secret": ( + windows99_management.get("can_collect_vmware_verify_without_secret") is True + ), + "windows99_ssh_batch_ready": ssh_batch.get("ready") is True, + "windows99_ssh_batch_status": str(ssh_batch.get("status") or "unknown"), + "windows99_winrm_http_open": ( + windows99_management.get("winrm_http_open") is True + ), + "windows99_winrm_https_open": ( + windows99_management.get("winrm_https_open") is True + ), + "windows99_rdp_console_reachable": ( + windows99_management.get("rdp_console_reachable") is True + ), "capacity_checked": capacity.get("checked") is True, "capacity_free_gib": capacity.get("free_gib"), "capacity_min_free_gib": capacity.get("min_free_gib"), @@ -1204,6 +1317,10 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]: "windows99_verify_collection_can_collect_no_secret": rollups[ "windows99_verify_collection_can_collect_no_secret" ], + "windows99_remote_execution_channel_ready": rollups[ + "windows99_remote_execution_channel_ready" + ], + "windows99_ssh_batch_status": rollups["windows99_ssh_batch_status"], "runtime_write_authorized_by_this_scorecard": False, "host_reboot_authorized_by_this_scorecard": False, "workflow_trigger_authorized_by_this_scorecard": False, @@ -1286,6 +1403,9 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: ) host_pressure = build_host_pressure_readback(read_json_object(args.host_pressure_file)) windows99 = parse_windows99_vmware_readback(read_text(args.windows99_vmware_file)) + windows99_management = parse_windows99_management_readback( + args.windows99_management_file + ) controls = source_controls() free_gib = disk_free_gib(args.disk_path) @@ -1367,6 +1487,12 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: blockers.append("local_disk_free_below_minimum") blockers.extend(strings(host_pressure.get("blockers"))) blockers.extend(strings(windows99.get("blockers"))) + if ( + windows99.get("readback_present") is False + and windows99_management.get("readback_present") is True + and windows99_management.get("remote_execution_channel_ready") is not True + ): + blockers.append("windows99_remote_execution_channel_unavailable") max_uptime = max( [int_value(row.get("uptime_seconds"), 0) for row in host_rows if row.get("reachable")] @@ -1449,6 +1575,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: "stockplatform_data_freshness": stockplatform, "host_pressure": host_pressure, "windows99_vmware_autostart": windows99, + "windows99_management_channel": windows99_management, "capacity": { "checked": free_gib is not None, "free_gib": round(free_gib, 3) if free_gib is not None else None, diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index b8534850..b0b9f2e9 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -91,11 +91,50 @@ VMWARE_AUTOSTART_VERIFY_READY=1 """ +WINDOWS99_MANAGEMENT_BLOCKED = { + "schema_version": "windows99_management_channel_readback_v1", + "generated_at": "2026-07-02T15:20:00+08:00", + "host": "192.168.0.99", + "host_reachable": True, + "tcp_ports": { + "22": "open", + "135": "open", + "445": "open", + "3389": "open", + "5985": "timeout", + "5986": "timeout", + }, + "ssh_user": "administrator", + "ssh_batch": { + "checked": True, + "ready": False, + "status": "permission_denied", + }, + "winrm_http_open": False, + "winrm_https_open": False, + "rdp_console_reachable": True, + "remote_execution_channel_ready": False, + "can_collect_vmware_verify_without_secret": False, + "blockers": [ + "windows99_remote_execution_channel_unavailable", + "windows99_winrm_unavailable", + "windows99_ssh_batch_denied", + ], + "forbidden_actions": [ + "read_windows_password", + "read_secret_value", + "start_vm", + "reboot_host", + ], +} + + def run_scorecard( tmp_path: Path, summary: str, probe: str = HOST_PROBE_GREEN, windows99: str = WINDOWS99_VMWARE_GREEN, + windows99_management: str | None = None, ) -> dict: summary_path = tmp_path / "summary.txt" probe_path = tmp_path / "probe.txt" @@ -105,21 +144,26 @@ def run_scorecard( probe_path.write_text(probe, encoding="utf-8") reboot_event_path.write_text(json.dumps(REBOOT_EVENT_GREEN), encoding="utf-8") windows99_path.write_text(windows99, encoding="utf-8") + windows99_management_path = tmp_path / "windows99-management.json" + args = [ + sys.executable, + str(SCRIPT), + "--summary-file", + str(summary_path), + "--host-probe-file", + str(probe_path), + "--reboot-event-file", + str(reboot_event_path), + "--windows99-vmware-file", + str(windows99_path), + "--generated-at", + "2026-06-29T14:30:00+08:00", + ] + if windows99_management is not None: + windows99_management_path.write_text(windows99_management, encoding="utf-8") + args.extend(["--windows99-management-file", str(windows99_management_path)]) result = subprocess.run( - [ - sys.executable, - str(SCRIPT), - "--summary-file", - str(summary_path), - "--host-probe-file", - str(probe_path), - "--reboot-event-file", - str(reboot_event_path), - "--windows99-vmware-file", - str(windows99_path), - "--generated-at", - "2026-06-29T14:30:00+08:00", - ], + args, text=True, capture_output=True, check=True, @@ -239,6 +283,7 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) - payload["readback"]["windows99_verify_collection_can_collect_no_secret"] is False ) + assert payload["required_checks"]["windows99_management_channel_ready"] is True assert payload["rollups"]["source_controls_present"] is True assert payload["rollups"]["windows99_vmware_verify_ready"] is True assert payload["rollups"]["windows99_update_no_auto_reboot_ready"] is True @@ -332,6 +377,41 @@ def test_missing_windows99_vmware_readback_fails_closed(tmp_path: Path) -> None: ) +def test_windows99_management_channel_unavailable_is_visible(tmp_path: Path) -> None: + payload = run_scorecard( + tmp_path, + GREEN_SUMMARY, + windows99="", + windows99_management=json.dumps(WINDOWS99_MANAGEMENT_BLOCKED), + ) + + assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready" + assert payload["can_claim_all_services_recovered_within_target"] is False + assert payload["active_blockers"] == [ + "windows99_remote_execution_channel_unavailable", + "windows99_vmware_autostart_readback_missing", + ] + assert payload["safe_next_step"] == ( + "restore_windows99_no_secret_management_channel_or_collect_local_console_" + "verify_readback_then_rerun_reboot_scorecard_no_reboot" + ) + assert payload["windows99_management_channel"]["readback_present"] is True + assert payload["windows99_management_channel"]["host_reachable"] is True + assert ( + payload["windows99_management_channel"]["remote_execution_channel_ready"] + is False + ) + assert payload["windows99_management_channel"]["ssh_batch"]["status"] == ( + "permission_denied" + ) + assert payload["windows99_management_channel"]["rdp_console_reachable"] is True + assert payload["rollups"]["windows99_management_readback_present"] is True + assert payload["rollups"]["windows99_host_reachable"] is True + assert payload["rollups"]["windows99_remote_execution_channel_ready"] is False + assert payload["rollups"]["windows99_ssh_batch_status"] == "permission_denied" + assert payload["readback"]["windows99_remote_execution_channel_ready"] is False + + def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None: summary = GREEN_SUMMARY.replace("WAZUH_DASHBOARD_DEGRADED=0", "WAZUH_DASHBOARD_DEGRADED=1") probe = HOST_PROBE_GREEN.replace("uptime_seconds=150", "uptime_seconds=900") diff --git a/scripts/reboot-recovery/windows99-management-channel-probe.py b/scripts/reboot-recovery/windows99-management-channel-probe.py new file mode 100644 index 00000000..0c81bff6 --- /dev/null +++ b/scripts/reboot-recovery/windows99-management-channel-probe.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +"""No-secret Windows 99 management-channel readback. + +This probe only checks whether a command channel is available for collecting the +Windows 99 VMware verifier. It does not read credentials, start VMs, or change +Windows/VMware state. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import socket +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Any + + +SCHEMA_VERSION = "windows99_management_channel_readback_v1" +DEFAULT_PORTS = (22, 135, 445, 3389, 5985, 5986) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Probe no-secret management channels for Windows host 99.", + ) + parser.add_argument("--host", default="192.168.0.99") + parser.add_argument("--ssh-user", default="administrator") + parser.add_argument("--tcp-timeout", type=float, default=2.0) + parser.add_argument("--ssh-timeout", type=int, default=8) + parser.add_argument( + "--port", + action="append", + type=int, + dest="ports", + help="TCP port to probe. May be passed more than once.", + ) + parser.add_argument("--skip-ssh", action="store_true") + parser.add_argument("--generated-at", help="Override generated_at.") + parser.add_argument("--output", type=Path, help="Write JSON to this path.") + return parser.parse_args() + + +def tcp_status(host: str, port: int, timeout: float) -> str: + try: + with socket.create_connection((host, port), timeout=timeout): + return "open" + except TimeoutError: + return "timeout" + except ConnectionRefusedError: + return "refused" + except OSError as exc: + name = exc.__class__.__name__ + if getattr(exc, "errno", None) is not None: + return f"{name}_{exc.errno}" + return name + + +def ping_status(host: str) -> dict[str, Any]: + if not shutil.which("ping"): + return {"checked": False, "ok": False, "status": "ping_missing"} + timeout_arg = "1000" if sys.platform == "darwin" else "1" + command = ["ping", "-c", "2", "-W", timeout_arg, host] + try: + result = subprocess.run( + command, + text=True, + capture_output=True, + timeout=5, + check=False, + ) + except subprocess.TimeoutExpired: + return {"checked": True, "ok": False, "status": "timeout"} + return { + "checked": True, + "ok": result.returncode == 0, + "status": "ok" if result.returncode == 0 else "failed", + } + + +def classify_ssh_failure(stderr: str, returncode: int) -> str: + lowered = stderr.lower() + if "permission denied" in lowered: + return "permission_denied" + if "connection timed out" in lowered or "operation timed out" in lowered: + return "timeout" + if "connection refused" in lowered: + return "refused" + if "no route to host" in lowered: + return "no_route" + if returncode == 124: + return "timeout" + return "failed" + + +def ssh_batch_status(host: str, user: str, timeout: int, port_open: bool) -> dict[str, Any]: + if not port_open: + return {"checked": False, "ready": False, "status": "port_not_open"} + ssh = shutil.which("ssh") + if not ssh: + return {"checked": False, "ready": False, "status": "ssh_missing"} + command = [ + ssh, + "-o", + "BatchMode=yes", + "-o", + f"ConnectTimeout={timeout}", + "-o", + "PreferredAuthentications=publickey", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "GlobalKnownHostsFile=/dev/null", + f"{user}@{host}", + "cmd", + "/c", + "echo", + "AWOOOI_WINDOWS99_SSH_OK", + ] + try: + result = subprocess.run( + command, + text=True, + capture_output=True, + timeout=timeout + 3, + check=False, + ) + except subprocess.TimeoutExpired: + return {"checked": True, "ready": False, "status": "timeout"} + + ready = ( + result.returncode == 0 + and "AWOOOI_WINDOWS99_SSH_OK" in (result.stdout or "") + ) + return { + "checked": True, + "ready": ready, + "status": "ready" + if ready + else classify_ssh_failure(result.stderr or "", result.returncode), + } + + +def build_payload(args: argparse.Namespace) -> dict[str, Any]: + ports = tuple(args.ports or DEFAULT_PORTS) + generated_at = args.generated_at or datetime.now().astimezone().isoformat(timespec="seconds") + tcp_ports = { + str(port): tcp_status(args.host, port, args.tcp_timeout) + for port in ports + } + ping = ping_status(args.host) + host_reachable = ping["ok"] or any(status == "open" for status in tcp_ports.values()) + winrm_http_open = tcp_ports.get("5985") == "open" + winrm_https_open = tcp_ports.get("5986") == "open" + rdp_console_reachable = tcp_ports.get("3389") == "open" + ssh_probe = ( + {"checked": False, "ready": False, "status": "skipped"} + if args.skip_ssh + else ssh_batch_status( + args.host, + args.ssh_user, + args.ssh_timeout, + tcp_ports.get("22") == "open", + ) + ) + remote_execution_ready = ssh_probe["ready"] is True + blockers: list[str] = [] + if not host_reachable: + blockers.append("windows99_host_unreachable_from_management_probe") + if not remote_execution_ready: + blockers.append("windows99_remote_execution_channel_unavailable") + if not (winrm_http_open or winrm_https_open): + blockers.append("windows99_winrm_unavailable") + if tcp_ports.get("22") == "open" and ssh_probe["status"] == "permission_denied": + blockers.append("windows99_ssh_batch_denied") + + return { + "schema_version": SCHEMA_VERSION, + "generated_at": generated_at, + "host": args.host, + "ping": ping, + "host_reachable": host_reachable, + "tcp_ports": tcp_ports, + "ssh_user": args.ssh_user, + "ssh_batch": ssh_probe, + "winrm_http_open": winrm_http_open, + "winrm_https_open": winrm_https_open, + "rdp_console_reachable": rdp_console_reachable, + "remote_execution_channel_ready": remote_execution_ready, + "can_collect_vmware_verify_without_secret": remote_execution_ready, + "blockers": blockers, + "forbidden_actions": [ + "read_windows_password", + "read_secret_value", + "start_vm", + "reboot_host", + "restart_service", + "write_windows_policy", + ], + } + + +def main() -> int: + args = parse_args() + payload = build_payload(args) + output = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n" + if args.output: + args.output.write_text(output, encoding="utf-8") + else: + sys.stdout.write(output) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())