fix(reboot): expose windows99 management channel readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 41s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-02 15:19:09 +08:00
parent 25148d0752
commit aaa617f00f
14 changed files with 983 additions and 195 deletions

View File

@@ -35,6 +35,7 @@ summary_file="$artifact_dir/summary.txt"
scorecard_file="$artifact_dir/scorecard.json"
stock_freshness_file="$artifact_dir/stock-freshness.json"
stock_ingestion_file="$artifact_dir/stock-ingestion.json"
windows99_management_file="$artifact_dir/windows99-management-channel.json"
reboot_event_state_file="${REBOOT_EVENT_STATE_FILE:-${LOG_DIR}/reboot-event-state.json}"
bash "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" >"$host_probe" 2>&1 || true
@@ -56,6 +57,12 @@ if command -v curl >/dev/null 2>&1; then
|| rm -f "$stock_ingestion_file"
fi
# Run the Windows 99 management-channel probe only when the probe script exists.
# Its stdout and stderr are kept as separate artifacts; if the probe exits
# non-zero, the JSON output path is removed so no file from a failed run remains.
if [ -f "$ROOT_DIR/scripts/reboot-recovery/windows99-management-channel-probe.py" ]; then
python3 "$ROOT_DIR/scripts/reboot-recovery/windows99-management-channel-probe.py" \
--output "$windows99_management_file" >"$artifact_dir/windows99-management-channel.stdout" 2>"$artifact_dir/windows99-management-channel.err" \
|| rm -f "$windows99_management_file"
fi
scorecard_args=(
"$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py"
--summary-file "$summary_file" \
@@ -72,6 +79,9 @@ fi
if [ -s "$stock_ingestion_file" ]; then
scorecard_args+=(--stock-ingestion-file "$stock_ingestion_file")
fi
# Pass the management-channel readback to the scorecard only when the file
# exists and is non-empty (-s).
if [ -s "$windows99_management_file" ]; then
scorecard_args+=(--windows99-management-file "$windows99_management_file")
fi
python3 "${scorecard_args[@]}" || true

View File

@@ -49,6 +49,11 @@ def parse_args() -> argparse.Namespace:
type=Path,
help="Optional windows99-vmware-autostart.ps1 Verify output.",
)
parser.add_argument(
"--windows99-management-file",
type=Path,
help="Optional no-secret Windows 99 management-channel JSON readback.",
)
parser.add_argument("--generated-at", help="Override generated_at for stable snapshots.")
parser.add_argument(
"--required-host",
@@ -259,6 +264,67 @@ def parse_windows99_vmware_readback(text: str) -> dict[str, Any]:
}
def parse_windows99_management_readback(path: Path | None) -> dict[str, Any]:
    """Parse no-secret Windows 99 management-channel readback JSON.

    Args:
        path: Location of the readback JSON file, or ``None`` when no
            readback was supplied.

    Returns:
        A normalized dict.

        * No ``path``: the all-false default with ``readback_present`` False
          and an empty ``blockers`` list.
        * File unreadable, not valid UTF-8, not valid JSON, or not a JSON
          object: the same default, with ``blockers`` set to
          ``["windows99_management_channel_readback_invalid"]``.
        * Otherwise: ``readback_present`` True; every boolean field is True
          only when the payload holds a literal ``true``; string fields are
          coerced with ``str``; list fields go through ``strings``.
    """
    default: dict[str, Any] = {
        "readback_present": False,
        "host": "192.168.0.99",
        "host_reachable": False,
        "tcp_ports": {},
        "ssh_batch": {"checked": False, "ready": False, "status": "missing"},
        "winrm_http_open": False,
        "winrm_https_open": False,
        "rdp_console_reachable": False,
        "remote_execution_channel_ready": False,
        "can_collect_vmware_verify_without_secret": False,
        "blockers": [],
    }
    if not path:
        return default

    # Fail-closed result shared by every "file present but unusable" case.
    invalid = {
        **default,
        "blockers": ["windows99_management_channel_readback_invalid"],
    }
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers missing, unreadable, and directory paths; ValueError
        # covers both json.JSONDecodeError and UnicodeDecodeError. Any of
        # them must yield a blocker rather than abort the whole scorecard.
        return invalid
    if not isinstance(payload, dict):
        return invalid

    tcp_ports = payload.get("tcp_ports")
    if not isinstance(tcp_ports, dict):
        tcp_ports = {}
    ssh_batch = payload.get("ssh_batch")
    if not isinstance(ssh_batch, dict):
        ssh_batch = {"checked": False, "ready": False, "status": "missing"}

    # Booleans use `is True` so truthy non-boolean values never count as ready.
    return {
        "readback_present": True,
        "schema_version": str(payload.get("schema_version") or "unknown"),
        "generated_at": str(payload.get("generated_at") or ""),
        "host": str(payload.get("host") or "192.168.0.99"),
        "host_reachable": payload.get("host_reachable") is True,
        "tcp_ports": {str(key): str(value) for key, value in tcp_ports.items()},
        "ssh_user": str(payload.get("ssh_user") or ""),
        "ssh_batch": {
            "checked": ssh_batch.get("checked") is True,
            "ready": ssh_batch.get("ready") is True,
            "status": str(ssh_batch.get("status") or "unknown"),
        },
        "winrm_http_open": payload.get("winrm_http_open") is True,
        "winrm_https_open": payload.get("winrm_https_open") is True,
        "rdp_console_reachable": payload.get("rdp_console_reachable") is True,
        "remote_execution_channel_ready": (
            payload.get("remote_execution_channel_ready") is True
        ),
        "can_collect_vmware_verify_without_secret": (
            payload.get("can_collect_vmware_verify_without_secret") is True
        ),
        "blockers": strings(payload.get("blockers")),
        "forbidden_actions": strings(payload.get("forbidden_actions")),
    }
def read_json_object(path: Path | None) -> dict[str, Any]:
if not path:
return {}
@@ -362,6 +428,11 @@ def source_controls() -> dict[str, bool]:
"WINDOWS_UPDATE_POLICY",
"VM_POWER",
),
"windows99_management_channel_probe_source_present": file_contains(
source_file("scripts/reboot-recovery/windows99-management-channel-probe.py"),
"windows99_management_channel_readback_v1",
"remote_execution_channel_ready",
),
"public_maintenance_fallback_source_present": source_file(
"ops/maintenance/maintenance.html"
).exists()
@@ -836,6 +907,11 @@ def choose_safe_next_step(
"restore_docker_stats_textfile_exporter_then_collect_sanitized_host_"
"pressure_no_restart_no_secret_read"
)
if "windows99_remote_execution_channel_unavailable" in blockers:
return (
"restore_windows99_no_secret_management_channel_or_collect_local_console_"
"verify_readback_then_rerun_reboot_scorecard_no_reboot"
)
if any(blocker.startswith("windows99_") for blocker in blockers):
return (
"collect_windows99_vmware_autostart_verify_readback_then_rerun_all_host_"
@@ -873,6 +949,12 @@ def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
windows99 = payload.get("windows99_vmware_autostart")
if not isinstance(windows99, dict):
windows99 = {}
windows99_management = payload.get("windows99_management_channel")
if not isinstance(windows99_management, dict):
windows99_management = {}
ssh_batch = windows99_management.get("ssh_batch")
if not isinstance(ssh_batch, dict):
ssh_batch = {}
return {
"source_controls_present": source_controls_present,
@@ -886,6 +968,10 @@ def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
"windows_update_no_auto_reboot_ready": (
windows99.get("windows_update_no_auto_reboot_ready") is True
),
"windows99_management_channel_ready": (
windows99.get("verify_ready") is True
or windows99_management.get("remote_execution_channel_ready") is True
),
"service_green": post_reboot_readiness.get("service_green") is True,
"product_data_green": post_reboot_readiness.get("product_data_green") is True,
"backup_core_green": post_reboot_readiness.get("backup_core_green") is True,
@@ -1059,6 +1145,12 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
windows99 = payload.get("windows99_vmware_autostart")
if not isinstance(windows99, dict):
windows99 = {}
windows99_management = payload.get("windows99_management_channel")
if not isinstance(windows99_management, dict):
windows99_management = {}
ssh_batch = windows99_management.get("ssh_batch")
if not isinstance(ssh_batch, dict):
ssh_batch = {}
capacity = payload.get("capacity")
if not isinstance(capacity, dict):
capacity = {}
@@ -1171,6 +1263,27 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
"windows99_host99_uptime_known": (
windows99_verify_collection["host99_uptime_known"] is True
),
"windows99_management_readback_present": (
windows99_management.get("readback_present") is True
),
"windows99_host_reachable": windows99_management.get("host_reachable") is True,
"windows99_remote_execution_channel_ready": (
windows99_management.get("remote_execution_channel_ready") is True
),
"windows99_can_collect_vmware_verify_without_secret": (
windows99_management.get("can_collect_vmware_verify_without_secret") is True
),
"windows99_ssh_batch_ready": ssh_batch.get("ready") is True,
"windows99_ssh_batch_status": str(ssh_batch.get("status") or "unknown"),
"windows99_winrm_http_open": (
windows99_management.get("winrm_http_open") is True
),
"windows99_winrm_https_open": (
windows99_management.get("winrm_https_open") is True
),
"windows99_rdp_console_reachable": (
windows99_management.get("rdp_console_reachable") is True
),
"capacity_checked": capacity.get("checked") is True,
"capacity_free_gib": capacity.get("free_gib"),
"capacity_min_free_gib": capacity.get("min_free_gib"),
@@ -1204,6 +1317,10 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
"windows99_verify_collection_can_collect_no_secret": rollups[
"windows99_verify_collection_can_collect_no_secret"
],
"windows99_remote_execution_channel_ready": rollups[
"windows99_remote_execution_channel_ready"
],
"windows99_ssh_batch_status": rollups["windows99_ssh_batch_status"],
"runtime_write_authorized_by_this_scorecard": False,
"host_reboot_authorized_by_this_scorecard": False,
"workflow_trigger_authorized_by_this_scorecard": False,
@@ -1286,6 +1403,9 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
)
host_pressure = build_host_pressure_readback(read_json_object(args.host_pressure_file))
windows99 = parse_windows99_vmware_readback(read_text(args.windows99_vmware_file))
windows99_management = parse_windows99_management_readback(
args.windows99_management_file
)
controls = source_controls()
free_gib = disk_free_gib(args.disk_path)
@@ -1367,6 +1487,12 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
blockers.append("local_disk_free_below_minimum")
blockers.extend(strings(host_pressure.get("blockers")))
blockers.extend(strings(windows99.get("blockers")))
if (
windows99.get("readback_present") is False
and windows99_management.get("readback_present") is True
and windows99_management.get("remote_execution_channel_ready") is not True
):
blockers.append("windows99_remote_execution_channel_unavailable")
max_uptime = max(
[int_value(row.get("uptime_seconds"), 0) for row in host_rows if row.get("reachable")]
@@ -1449,6 +1575,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
"stockplatform_data_freshness": stockplatform,
"host_pressure": host_pressure,
"windows99_vmware_autostart": windows99,
"windows99_management_channel": windows99_management,
"capacity": {
"checked": free_gib is not None,
"free_gib": round(free_gib, 3) if free_gib is not None else None,

View File

@@ -91,11 +91,50 @@ VMWARE_AUTOSTART_VERIFY_READY=1
"""
# Fixture: a management-channel readback where the host answers on TCP but no
# remote execution channel is ready (SSH batch login denied, WinRM ports time out).
WINDOWS99_MANAGEMENT_BLOCKED = {
    "schema_version": "windows99_management_channel_readback_v1",
    "generated_at": "2026-07-02T15:20:00+08:00",
    "host": "192.168.0.99",
    "host_reachable": True,
    # Per-port TCP probe results, keyed by port number as a string.
    "tcp_ports": {
        "22": "open",
        "135": "open",
        "445": "open",
        "3389": "open",
        "5985": "timeout",
        "5986": "timeout",
    },
    "ssh_user": "administrator",
    "ssh_batch": {
        "checked": True,
        "ready": False,
        "status": "permission_denied",
    },
    "winrm_http_open": False,
    "winrm_https_open": False,
    "rdp_console_reachable": True,
    "remote_execution_channel_ready": False,
    "can_collect_vmware_verify_without_secret": False,
    "blockers": [
        "windows99_remote_execution_channel_unavailable",
        "windows99_winrm_unavailable",
        "windows99_ssh_batch_denied",
    ],
    "forbidden_actions": [
        "read_windows_password",
        "read_secret_value",
        "start_vm",
        "reboot_host",
    ],
}
def run_scorecard(
tmp_path: Path,
summary: str,
probe: str = HOST_PROBE_GREEN,
windows99: str = WINDOWS99_VMWARE_GREEN,
windows99_management: str | None = None,
) -> dict:
summary_path = tmp_path / "summary.txt"
probe_path = tmp_path / "probe.txt"
@@ -105,21 +144,26 @@ def run_scorecard(
probe_path.write_text(probe, encoding="utf-8")
reboot_event_path.write_text(json.dumps(REBOOT_EVENT_GREEN), encoding="utf-8")
windows99_path.write_text(windows99, encoding="utf-8")
windows99_management_path = tmp_path / "windows99-management.json"
args = [
sys.executable,
str(SCRIPT),
"--summary-file",
str(summary_path),
"--host-probe-file",
str(probe_path),
"--reboot-event-file",
str(reboot_event_path),
"--windows99-vmware-file",
str(windows99_path),
"--generated-at",
"2026-06-29T14:30:00+08:00",
]
if windows99_management is not None:
windows99_management_path.write_text(windows99_management, encoding="utf-8")
args.extend(["--windows99-management-file", str(windows99_management_path)])
result = subprocess.run(
[
sys.executable,
str(SCRIPT),
"--summary-file",
str(summary_path),
"--host-probe-file",
str(probe_path),
"--reboot-event-file",
str(reboot_event_path),
"--windows99-vmware-file",
str(windows99_path),
"--generated-at",
"2026-06-29T14:30:00+08:00",
],
args,
text=True,
capture_output=True,
check=True,
@@ -239,6 +283,7 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -
payload["readback"]["windows99_verify_collection_can_collect_no_secret"]
is False
)
assert payload["required_checks"]["windows99_management_channel_ready"] is True
assert payload["rollups"]["source_controls_present"] is True
assert payload["rollups"]["windows99_vmware_verify_ready"] is True
assert payload["rollups"]["windows99_update_no_auto_reboot_ready"] is True
@@ -332,6 +377,41 @@ def test_missing_windows99_vmware_readback_fails_closed(tmp_path: Path) -> None:
)
def test_windows99_management_channel_unavailable_is_visible(tmp_path: Path) -> None:
    """A blocked management channel shows up in blockers, next step, and rollups.

    The VMware verify readback is passed as empty text and the management
    readback is the blocked fixture, so the scorecard is expected to stay
    blocked and echo the channel details.
    """
    scorecard = run_scorecard(
        tmp_path,
        GREEN_SUMMARY,
        windows99="",
        windows99_management=json.dumps(WINDOWS99_MANAGEMENT_BLOCKED),
    )

    # Overall verdict and the recommended follow-up.
    assert scorecard["status"] == "blocked_reboot_auto_recovery_slo_not_ready"
    assert scorecard["can_claim_all_services_recovered_within_target"] is False
    expected_blockers = [
        "windows99_remote_execution_channel_unavailable",
        "windows99_vmware_autostart_readback_missing",
    ]
    assert scorecard["active_blockers"] == expected_blockers
    expected_next_step = (
        "restore_windows99_no_secret_management_channel_or_collect_local_console_"
        "verify_readback_then_rerun_reboot_scorecard_no_reboot"
    )
    assert scorecard["safe_next_step"] == expected_next_step

    # The parsed readback is echoed under its own top-level key.
    channel = scorecard["windows99_management_channel"]
    assert channel["readback_present"] is True
    assert channel["host_reachable"] is True
    assert channel["remote_execution_channel_ready"] is False
    assert channel["ssh_batch"]["status"] == "permission_denied"
    assert channel["rdp_console_reachable"] is True

    # The same facts are mirrored into the rollups and the readback summary.
    rollups = scorecard["rollups"]
    assert rollups["windows99_management_readback_present"] is True
    assert rollups["windows99_host_reachable"] is True
    assert rollups["windows99_remote_execution_channel_ready"] is False
    assert rollups["windows99_ssh_batch_status"] == "permission_denied"
    assert scorecard["readback"]["windows99_remote_execution_channel_ready"] is False
def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None:
summary = GREEN_SUMMARY.replace("WAZUH_DASHBOARD_DEGRADED=0", "WAZUH_DASHBOARD_DEGRADED=1")
probe = HOST_PROBE_GREEN.replace("uptime_seconds=150", "uptime_seconds=900")

View File

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
"""No-secret Windows 99 management-channel readback.
This probe only checks whether a command channel is available for collecting the
Windows 99 VMware verifier. It does not read credentials, start VMs, or change
Windows/VMware state.
"""
from __future__ import annotations
import argparse
import json
import shutil
import socket
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
# Schema tag written into every readback payload under "schema_version".
SCHEMA_VERSION = "windows99_management_channel_readback_v1"
# TCP ports probed when no --port option is given. build_payload reads 22 for
# the SSH check, 3389 for RDP reachability, and 5985/5986 for WinRM HTTP/HTTPS.
DEFAULT_PORTS = (22, 135, 445, 3389, 5985, 5986)
def parse_args() -> argparse.Namespace:
    """Define the probe's command-line options and parse the process arguments.

    Returns:
        The parsed namespace with ``host``, ``ssh_user``, ``tcp_timeout``,
        ``ssh_timeout``, ``ports`` (``None`` unless ``--port`` was given),
        ``skip_ssh``, ``generated_at``, and ``output``.
    """
    cli = argparse.ArgumentParser(
        description="Probe no-secret management channels for Windows host 99.",
    )

    # Target and login name.
    for flag, fallback in (("--host", "192.168.0.99"), ("--ssh-user", "administrator")):
        cli.add_argument(flag, default=fallback)

    # Timeouts, in the unit each consumer expects (float for TCP, int for ssh).
    for flag, kind, fallback in (
        ("--tcp-timeout", float, 2.0),
        ("--ssh-timeout", int, 8),
    ):
        cli.add_argument(flag, type=kind, default=fallback)

    # Repeatable port override; stays None when never passed.
    cli.add_argument(
        "--port",
        dest="ports",
        type=int,
        action="append",
        help="TCP port to probe. May be passed more than once.",
    )
    cli.add_argument("--skip-ssh", action="store_true")
    cli.add_argument("--generated-at", help="Override generated_at.")
    cli.add_argument("--output", type=Path, help="Write JSON to this path.")
    return cli.parse_args()
def tcp_status(host: str, port: int, timeout: float) -> str:
    """Attempt one TCP connection and describe the outcome as a short string.

    Args:
        host: Host name or address to connect to.
        port: TCP port number.
        timeout: Timeout passed to ``socket.create_connection``.

    Returns:
        ``"open"`` when the connection succeeds, ``"timeout"`` on
        ``TimeoutError``, ``"refused"`` on ``ConnectionRefusedError``, and for
        any other ``OSError`` the exception class name, suffixed with
        ``_<errno>`` when the exception carries an errno.
    """
    try:
        # The connection is closed again immediately; only reachability matters.
        with socket.create_connection((host, port), timeout=timeout):
            pass
    except TimeoutError:
        return "timeout"
    except ConnectionRefusedError:
        return "refused"
    except OSError as exc:
        label = type(exc).__name__
        code = getattr(exc, "errno", None)
        return label if code is None else f"{label}_{code}"
    else:
        return "open"
def ping_status(host: str) -> dict[str, Any]:
    """Run the system ``ping`` against ``host`` and summarize the result.

    Args:
        host: Host name or address to ping.

    Returns:
        A dict with ``checked``, ``ok`` and ``status``. ``status`` is
        ``"ping_missing"`` when no ``ping`` executable is on PATH (nothing is
        run), ``"timeout"`` when the command exceeds five seconds, and
        otherwise ``"ok"`` or ``"failed"`` depending on the exit code.
    """
    if shutil.which("ping") is None:
        return {"checked": False, "ok": False, "status": "ping_missing"}

    # NOTE(review): the -W value differs on darwin, presumably because that
    # ping takes the wait in milliseconds rather than seconds - confirm.
    wait = "1000" if sys.platform == "darwin" else "1"
    try:
        proc = subprocess.run(
            ["ping", "-c", "2", "-W", wait, host],
            capture_output=True,
            text=True,
            check=False,
            timeout=5,
        )
    except subprocess.TimeoutExpired:
        return {"checked": True, "ok": False, "status": "timeout"}

    succeeded = proc.returncode == 0
    return {
        "checked": True,
        "ok": succeeded,
        "status": "ok" if succeeded else "failed",
    }
def classify_ssh_failure(stderr: str, returncode: int) -> str:
    """Map a failed ssh invocation to a short status label.

    Args:
        stderr: Text the ssh process wrote to stderr; matched case-insensitively.
        returncode: Exit code of the ssh process.

    Returns:
        The label of the first matching stderr phrase, checked in this order:
        ``"permission_denied"``, ``"timeout"``, ``"refused"``, ``"no_route"``.
        With no phrase match, ``"timeout"`` for exit code 124 and ``"failed"``
        for anything else.
    """
    text = stderr.lower()
    # Ordered: the first row with any phrase present in stderr wins.
    phrase_table = (
        (("permission denied",), "permission_denied"),
        (("connection timed out", "operation timed out"), "timeout"),
        (("connection refused",), "refused"),
        (("no route to host",), "no_route"),
    )
    for phrases, label in phrase_table:
        if any(phrase in text for phrase in phrases):
            return label
    return "timeout" if returncode == 124 else "failed"
def ssh_batch_status(host: str, user: str, timeout: int, port_open: bool) -> dict[str, Any]:
    """Check whether a non-interactive, public-key ssh command can run on the host.

    Args:
        host: Host name or address to connect to.
        user: Login name used as ``user@host``.
        timeout: ssh ``ConnectTimeout`` in seconds; the whole command is
            allowed ``timeout + 3`` seconds before it is abandoned.
        port_open: Whether the caller already saw the SSH port open. When
            False, ssh is not run at all.

    Returns:
        A dict with ``checked``, ``ready`` and ``status``. ``status`` is
        ``"port_not_open"`` or ``"ssh_missing"`` when nothing was run,
        ``"timeout"`` when the command overran, ``"ready"`` when ssh exited 0
        and echoed the marker, and otherwise the label from
        ``classify_ssh_failure``.
    """
    if not port_open:
        return {"checked": False, "ready": False, "status": "port_not_open"}
    ssh = shutil.which("ssh")
    if not ssh:
        return {"checked": False, "ready": False, "status": "ssh_missing"}

    marker = "AWOOOI_WINDOWS99_SSH_OK"
    # BatchMode plus publickey-only means ssh never prompts for a password.
    # NOTE(review): host-key checking is disabled and no known_hosts file is
    # used, so the remote host's identity is not verified by this probe.
    options = (
        "BatchMode=yes",
        f"ConnectTimeout={timeout}",
        "PreferredAuthentications=publickey",
        "StrictHostKeyChecking=no",
        "UserKnownHostsFile=/dev/null",
        "GlobalKnownHostsFile=/dev/null",
    )
    command = [ssh]
    for option in options:
        command += ["-o", option]
    command += [f"{user}@{host}", "cmd", "/c", "echo", marker]

    try:
        result = subprocess.run(
            command,
            # Never let ssh inherit the caller's stdin: it could swallow input
            # meant for the calling script or stall when no usable stdin exists.
            stdin=subprocess.DEVNULL,
            text=True,
            capture_output=True,
            timeout=timeout + 3,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return {"checked": True, "ready": False, "status": "timeout"}

    ready = result.returncode == 0 and marker in (result.stdout or "")
    if ready:
        status = "ready"
    else:
        status = classify_ssh_failure(result.stderr or "", result.returncode)
    return {"checked": True, "ready": ready, "status": status}
def build_payload(args: argparse.Namespace) -> dict[str, Any]:
    """Run all probes and assemble the management-channel readback document.

    Args:
        args: Parsed options; reads ``ports``, ``generated_at``, ``host``,
            ``tcp_timeout``, ``skip_ssh``, ``ssh_user`` and ``ssh_timeout``.

    Returns:
        The readback dict: schema tag, timestamp, ping and per-port TCP
        results, the ssh batch result, derived WinRM/RDP flags, the
        remote-execution readiness flag (true only when the ssh batch check
        is ready), the list of blockers, and the fixed forbidden-action list.
    """
    stamp = args.generated_at or datetime.now().astimezone().isoformat(timespec="seconds")

    # Probe every requested port in order; results are keyed by str(port).
    tcp_ports: dict[str, str] = {}
    for port in tuple(args.ports or DEFAULT_PORTS):
        tcp_ports[str(port)] = tcp_status(args.host, port, args.tcp_timeout)

    ping = ping_status(args.host)
    # Reachable if ping answered or at least one probed port accepted a connection.
    host_reachable = ping["ok"] or "open" in tcp_ports.values()

    ssh_port_open = tcp_ports.get("22") == "open"
    winrm_http_open = tcp_ports.get("5985") == "open"
    winrm_https_open = tcp_ports.get("5986") == "open"
    rdp_console_reachable = tcp_ports.get("3389") == "open"

    if args.skip_ssh:
        ssh_probe = {"checked": False, "ready": False, "status": "skipped"}
    else:
        ssh_probe = ssh_batch_status(
            args.host,
            args.ssh_user,
            args.ssh_timeout,
            ssh_port_open,
        )
    # Only a working ssh batch login counts as a remote execution channel.
    remote_execution_ready = ssh_probe["ready"] is True

    blocker_checks = (
        (not host_reachable, "windows99_host_unreachable_from_management_probe"),
        (not remote_execution_ready, "windows99_remote_execution_channel_unavailable"),
        (not winrm_http_open and not winrm_https_open, "windows99_winrm_unavailable"),
        (
            ssh_port_open and ssh_probe["status"] == "permission_denied",
            "windows99_ssh_batch_denied",
        ),
    )
    blockers: list[str] = [name for hit, name in blocker_checks if hit]

    return {
        "schema_version": SCHEMA_VERSION,
        "generated_at": stamp,
        "host": args.host,
        "ping": ping,
        "host_reachable": host_reachable,
        "tcp_ports": tcp_ports,
        "ssh_user": args.ssh_user,
        "ssh_batch": ssh_probe,
        "winrm_http_open": winrm_http_open,
        "winrm_https_open": winrm_https_open,
        "rdp_console_reachable": rdp_console_reachable,
        "remote_execution_channel_ready": remote_execution_ready,
        "can_collect_vmware_verify_without_secret": remote_execution_ready,
        "blockers": blockers,
        "forbidden_actions": [
            "read_windows_password",
            "read_secret_value",
            "start_vm",
            "reboot_host",
            "restart_service",
            "write_windows_policy",
        ],
    }
def main() -> int:
    """Parse options, run the probe, and emit the readback as JSON.

    The JSON is pretty-printed with sorted keys and a trailing newline. It is
    written to ``--output`` when given, otherwise to stdout.

    Returns:
        Always 0.
    """
    args = parse_args()
    rendered = (
        json.dumps(build_payload(args), ensure_ascii=False, indent=2, sort_keys=True)
        + "\n"
    )
    if not args.output:
        sys.stdout.write(rendered)
        return 0
    args.output.write_text(rendered, encoding="utf-8")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())