fix(recovery): expose reboot slo machine readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
ogt
2026-07-01 20:35:22 +08:00
parent 68c982fe6a
commit 17b6bf36ce
6 changed files with 358 additions and 3 deletions

View File

@@ -589,6 +589,206 @@ def choose_safe_next_step(
)
def percent(value: float) -> int:
    """Round ``value`` to the nearest integer and clamp it to 0..100.

    Rounding uses the built-in ``round``; the clamp is applied afterwards,
    so anything that rounds below 0 yields 0 and anything above 100 yields 100.
    """
    rounded = round(value)
    if rounded < 0:
        return 0
    if rounded > 100:
        return 100
    return rounded
def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
    """Derive the named pass/fail checks from a scorecard payload.

    Reads ``source_controls``, ``host_boot_detection``,
    ``post_reboot_readiness``, ``stockplatform_data_freshness`` and
    ``can_claim_all_services_recovered_within_target`` from ``payload`` and
    returns a mapping of check name to boolean. ``payload`` is not modified.
    """

    def section(key: str) -> dict[str, Any]:
        # A section that is absent or not a dict is read as empty.
        candidate = payload.get(key)
        return candidate if isinstance(candidate, dict) else {}

    # Source controls pass only when the mapping is a non-empty dict and
    # every one of its values is truthy.
    controls = payload.get("source_controls")
    if isinstance(controls, dict) and controls:
        controls_ok = all(controls.values())
    else:
        controls_ok = False

    boot = section("host_boot_detection")
    readiness = section("post_reboot_readiness")
    stock = section("stockplatform_data_freshness")

    # NOTE(review): each host check below passes whenever strings() yields
    # nothing for its list, which presumably includes a missing
    # host_boot_detection section -- confirm that case is caught elsewhere.
    hosts_observed = not strings(boot.get("missing_hosts"))
    hosts_reachable = not strings(boot.get("unreachable_hosts"))
    reboot_window_fresh = not strings(boot.get("stale_hosts"))

    # NOTE(review): an ingestion status of "unknown" counts as passing,
    # while a missing status does not -- confirm this is intended.
    ingestion_ok = stock.get("ingestion_status") in {"ok", "unknown"}
    freshness_ok = stock.get("freshness_status") == "ok"

    slo_claimable = (
        payload.get("can_claim_all_services_recovered_within_target") is True
    )

    # Only a literal True counts as green; other truthy values do not.
    return {
        "source_controls_present": controls_ok,
        "required_hosts_observed": hosts_observed,
        "required_hosts_reachable": hosts_reachable,
        "service_green": readiness.get("service_green") is True,
        "product_data_green": readiness.get("product_data_green") is True,
        "backup_core_green": readiness.get("backup_core_green") is True,
        "host_188_service_green": readiness.get("host_188_service_green") is True,
        "stockplatform_freshness_ok": freshness_ok,
        "stockplatform_ingestion_ok": ingestion_ok,
        "fresh_reboot_window_observed": reboot_window_fresh,
        "can_claim_slo": slo_claimable,
    }
def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
    """Attach the P0-006 machine-readable views to a scorecard payload.

    Four keys are written onto ``payload`` in place -- ``required_checks``,
    ``readback``, ``rollups`` and ``summary`` -- and the same dict object is
    returned. Sections that are missing or are not dicts are read as empty.
    """

    def section(key: str) -> dict[str, Any]:
        # A section that is absent or not a dict is read as empty.
        candidate = payload.get(key)
        return candidate if isinstance(candidate, dict) else {}

    blockers = strings(payload.get("active_blockers"))
    boot = section("host_boot_detection")
    readiness = section("post_reboot_readiness")
    stock = section("stockplatform_data_freshness")
    disk = section("capacity")
    verify_metric = section("latest_verify_only_metric")
    controls = section("source_controls")

    # Scalars that appear in more than one of the output views.
    workplan_id = "P0-006"
    blocker_total = len(blockers)
    status_text = str(payload.get("status") or "unknown")
    next_step = str(payload.get("safe_next_step") or "")
    slo_claimable = (
        payload.get("can_claim_all_services_recovered_within_target") is True
    )

    required_checks = build_required_checks(payload)
    check_total = len(required_checks)
    passed_total = sum(map(bool, required_checks.values()))

    # A claimable SLO with no active blockers is reported as fully ready
    # regardless of the per-check ratio.
    if slo_claimable and not blockers:
        readiness_pct = 100
    else:
        readiness_pct = percent(passed_total / max(check_total, 1) * 100)

    only_window_blocker = blockers == [
        "host_boot_observation_older_than_target_window"
    ]

    control_total = len(controls)
    controls_ready = len([flag for flag in controls.values() if flag])
    controls_present = control_total > 0 and controls_ready == control_total

    host_counts = {
        count_key: len(strings(boot.get(list_key)))
        for count_key, list_key in (
            ("observed_host_count", "observed_hosts"),
            ("missing_host_count", "missing_hosts"),
            ("unreachable_host_count", "unreachable_hosts"),
            ("stale_host_count", "stale_hosts"),
            ("unknown_uptime_host_count", "unknown_uptime_hosts"),
        )
    }
    # Only a literal True counts as green.
    green_flags = {
        flag_name: readiness.get(flag_name) is True
        for flag_name in (
            "service_green",
            "product_data_green",
            "backup_core_green",
            "host_188_service_green",
        )
    }
    freshness_status = str(stock.get("freshness_status") or "unknown")
    ingestion_status = str(stock.get("ingestion_status") or "unknown")

    rollups = {
        "active_blocker_count": blocker_total,
        "readiness_percent": readiness_pct,
        "completed_check_count": passed_total,
        "required_check_count": check_total,
        "source_control_count": control_total,
        "source_control_ready_count": controls_ready,
        "source_controls_present": controls_present,
        "can_claim_all_services_recovered_within_target": slo_claimable,
        **host_counts,
        "post_start_blocked": int_value(readiness.get("post_start_blocked")),
        **green_flags,
        "blocked_by_fresh_reboot_window_only": only_window_blocker,
        "latest_verify_only_metric_present": bool(verify_metric),
        "latest_verify_only_metric_ready": int_value(verify_metric.get("ready")),
        "latest_verify_only_metric_blocker_count": int_value(
            verify_metric.get("blocker_count")
        ),
        "latest_verify_only_metric_max_host_uptime_seconds": int_value(
            verify_metric.get("max_host_uptime_seconds")
        ),
        "latest_verify_only_metric_last_run_timestamp": int_value(
            verify_metric.get("last_run_timestamp")
        ),
        "stockplatform_freshness_status": freshness_status,
        "stockplatform_ingestion_status": ingestion_status,
        "stockplatform_freshness_blocker_count": len(
            strings(stock.get("freshness_blockers"))
        ),
        "stockplatform_ingestion_blocker_count": len(
            strings(stock.get("ingestion_blockers"))
        ),
        "capacity_checked": disk.get("checked") is True,
        "capacity_free_gib": disk.get("free_gib"),
        "capacity_min_free_gib": disk.get("min_free_gib"),
        "capacity_below_minimum": "local_disk_free_below_minimum" in blockers,
    }

    # The four trailing authorization flags are hard-coded to False.
    readback = {
        "workplan_id": workplan_id,
        "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
        "source_id": "reboot_auto_recovery_slo_scorecard",
        "status": status_text,
        "target_minutes": int_value(payload.get("target_minutes")),
        "safe_next_step": next_step,
        "active_blockers": blockers,
        "active_blocker_count": blocker_total,
        "readiness_percent": readiness_pct,
        "blocked_by_fresh_reboot_window_only": only_window_blocker,
        "required_checks": required_checks,
        "source_controls_present": controls_present,
        "runtime_write_authorized_by_this_scorecard": False,
        "host_reboot_authorized_by_this_scorecard": False,
        "workflow_trigger_authorized_by_this_scorecard": False,
        "secret_value_collection_allowed": False,
    }

    summary = {
        "reboot_auto_recovery_status": status_text,
        "reboot_auto_recovery_workplan_id": workplan_id,
        "reboot_auto_recovery_readiness_percent": readiness_pct,
        "reboot_auto_recovery_active_blocker_count": blocker_total,
        "reboot_auto_recovery_can_claim_slo": slo_claimable,
        "reboot_auto_recovery_service_green": green_flags["service_green"],
        "reboot_auto_recovery_product_data_green": green_flags[
            "product_data_green"
        ],
        "reboot_auto_recovery_backup_core_green": green_flags["backup_core_green"],
        "reboot_auto_recovery_host_188_service_green": green_flags[
            "host_188_service_green"
        ],
        "reboot_auto_recovery_observed_host_count": host_counts[
            "observed_host_count"
        ],
        "reboot_auto_recovery_stale_host_count": host_counts["stale_host_count"],
        "reboot_auto_recovery_stockplatform_freshness_status": freshness_status,
        "reboot_auto_recovery_stockplatform_ingestion_status": ingestion_status,
        "reboot_auto_recovery_safe_next_step": next_step,
        "reboot_auto_recovery_source_controls_present": controls_present,
        "secret_values_collected": False,
        "github_api_used": False,
        "workflow_trigger_performed": False,
        "runtime_write_authorized": False,
    }

    # Keyword order fixes the insertion order of any newly added keys.
    payload.update(
        required_checks=required_checks,
        readback=readback,
        rollups=rollups,
        summary=summary,
    )
    return payload
def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
target_seconds = args.target_minutes * 60
generated_at = args.generated_at or datetime.now().astimezone().isoformat(timespec="seconds")
@@ -695,7 +895,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
stockplatform=stockplatform,
host_pressure=host_pressure,
)
return {
payload = {
"schema_version": SCHEMA_VERSION,
"generated_at": generated_at,
"target_minutes": args.target_minutes,
@@ -772,6 +972,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
"active_blockers": unique_blockers,
"safe_next_step": safe_next_step,
}
return enrich_machine_readback(payload)
def main() -> int:

View File

@@ -169,6 +169,15 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -
assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1"
assert payload["status"] == "slo_ready"
assert payload["can_claim_all_services_recovered_within_target"] is True
assert payload["readback"]["workplan_id"] == "P0-006"
assert payload["readback"]["readiness_percent"] == 100
assert payload["readback"]["active_blocker_count"] == 0
assert payload["readback"]["runtime_write_authorized_by_this_scorecard"] is False
assert payload["rollups"]["source_controls_present"] is True
assert payload["rollups"]["readiness_percent"] == 100
assert payload["summary"]["reboot_auto_recovery_workplan_id"] == "P0-006"
assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is True
assert payload["summary"]["runtime_write_authorized"] is False
assert payload["source_controls"][
"host_110_startup_controlled_drain_guarded_autostart_source_present"
] is True
@@ -186,6 +195,11 @@ def test_missing_probe_fails_closed(tmp_path: Path) -> None:
assert payload["can_claim_all_services_recovered_within_target"] is False
assert "all_host_reboot_detection_missing" in payload["active_blockers"]
assert "host_boot_probe_missing_hosts" in payload["active_blockers"]
assert payload["readback"]["active_blocker_count"] == len(
payload["active_blockers"]
)
assert payload["rollups"]["readiness_percent"] < 100
assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is False
def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None: