fix(recovery): expose reboot slo machine readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -589,6 +589,206 @@ def choose_safe_next_step(
|
||||
)
|
||||
|
||||
|
||||
def percent(value: float) -> int:
    """Return *value* rounded to a whole number and clamped to 0..100.

    Finite inputs give the same result as rounding first and clamping
    afterwards. Non-finite inputs no longer raise: NaN maps to 0, so an
    undefined ratio never reads as progress, and +/-infinity clamp to the
    nearest bound.
    """
    if value != value:  # NaN is the only value that is unequal to itself.
        return 0
    # Clamp before rounding: round() raises OverflowError on infinities,
    # while min()/max() handle them without error.
    return round(max(0, min(100, value)))
|
||||
|
||||
|
||||
def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
    """Build the named pass/fail map of required checks for a scorecard.

    Args:
        payload: Scorecard payload. The sections ``source_controls``,
            ``host_boot_detection``, ``post_reboot_readiness`` and
            ``stockplatform_data_freshness`` are read from it; a section
            that is missing or is not a dict is treated as empty.

    Returns:
        A dict mapping each check name to ``True`` (passed) or ``False``.
        Insertion order is fixed by the literal below.
    """

    def section(key: str) -> dict[str, Any]:
        # Malformed sections are read as empty rather than raising.
        value = payload.get(key)
        return value if isinstance(value, dict) else {}

    controls = section("source_controls")
    host_boot_detection = section("host_boot_detection")
    post_reboot_readiness = section("post_reboot_readiness")
    stockplatform = section("stockplatform_data_freshness")

    # NOTE(review): `strings` is defined elsewhere in this module; it is
    # presumably a list-of-strings normalizer, so each host check passes
    # when the corresponding host list is empty or absent - confirm.
    return {
        # At least one control must exist and every control must be truthy.
        "source_controls_present": bool(controls) and all(controls.values()),
        "required_hosts_observed": not strings(
            host_boot_detection.get("missing_hosts")
        ),
        "required_hosts_reachable": not strings(
            host_boot_detection.get("unreachable_hosts")
        ),
        # Readiness flags pass only on a literal True, not on truthy values.
        "service_green": post_reboot_readiness.get("service_green") is True,
        "product_data_green": (
            post_reboot_readiness.get("product_data_green") is True
        ),
        "backup_core_green": (
            post_reboot_readiness.get("backup_core_green") is True
        ),
        "host_188_service_green": (
            post_reboot_readiness.get("host_188_service_green") is True
        ),
        "stockplatform_freshness_ok": (
            stockplatform.get("freshness_status") == "ok"
        ),
        # Tuple membership compares by equality only. A set would hash the
        # looked-up value and raise TypeError for an unhashable status
        # (for example a list), instead of simply failing the check.
        "stockplatform_ingestion_ok": (
            stockplatform.get("ingestion_status") in ("ok", "unknown")
        ),
        "fresh_reboot_window_observed": not strings(
            host_boot_detection.get("stale_hosts")
        ),
        "can_claim_slo": (
            payload.get("can_claim_all_services_recovered_within_target")
            is True
        ),
    }
|
||||
|
||||
|
||||
def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
    """Attach the P0-006 machine-readback views to a scorecard payload.

    Four keys are written onto *payload* in place: ``required_checks``,
    ``readback``, ``rollups`` and ``summary``. The same dict object is then
    returned.

    Args:
        payload: Scorecard payload. Sections that are missing or are not
            dicts are treated as empty.

    Returns:
        The mutated *payload*.
    """

    def section(key: str) -> dict[str, Any]:
        # Missing or malformed sections are read as empty dicts.
        found = payload.get(key)
        return found if isinstance(found, dict) else {}

    # NOTE(review): `strings` and `int_value` are defined elsewhere in this
    # module; presumably they normalize to a list of strings / an int.
    blockers = strings(payload.get("active_blockers"))
    blocker_count = len(blockers)

    boot = section("host_boot_detection")
    readiness = section("post_reboot_readiness")
    stock = section("stockplatform_data_freshness")
    disk = section("capacity")
    verify_metric = section("latest_verify_only_metric")
    source_controls = section("source_controls")

    status_text = str(payload.get("status") or "unknown")
    next_step = str(payload.get("safe_next_step") or "")
    slo_claimable = (
        payload.get("can_claim_all_services_recovered_within_target") is True
    )

    required_checks = build_required_checks(payload)
    passed_total = len([ok for ok in required_checks.values() if ok])
    check_total = len(required_checks)

    if slo_claimable and not blockers:
        # A claimable SLO with no blockers always reads as fully ready.
        readiness_pct = 100
    else:
        readiness_pct = percent(passed_total / max(check_total, 1) * 100)

    only_fresh_window_blocks = blockers == [
        "host_boot_observation_older_than_target_window"
    ]

    ready_controls = len([flag for flag in source_controls.values() if flag])
    controls_ok = (
        len(source_controls) > 0 and ready_controls == len(source_controls)
    )

    # Each flag is green only when the stored value is literally True.
    green = {
        flag: readiness.get(flag) is True
        for flag in (
            "service_green",
            "product_data_green",
            "backup_core_green",
            "host_188_service_green",
        )
    }

    rollups = {
        "active_blocker_count": blocker_count,
        "readiness_percent": readiness_pct,
        "completed_check_count": passed_total,
        "required_check_count": check_total,
        "source_control_count": len(source_controls),
        "source_control_ready_count": ready_controls,
        "source_controls_present": controls_ok,
        "can_claim_all_services_recovered_within_target": slo_claimable,
        "observed_host_count": len(strings(boot.get("observed_hosts"))),
        "missing_host_count": len(strings(boot.get("missing_hosts"))),
        "unreachable_host_count": len(strings(boot.get("unreachable_hosts"))),
        "stale_host_count": len(strings(boot.get("stale_hosts"))),
        "unknown_uptime_host_count": len(
            strings(boot.get("unknown_uptime_hosts"))
        ),
        "post_start_blocked": int_value(readiness.get("post_start_blocked")),
        # Expands, in order, to service_green, product_data_green,
        # backup_core_green and host_188_service_green.
        **green,
        "blocked_by_fresh_reboot_window_only": only_fresh_window_blocks,
        "latest_verify_only_metric_present": bool(verify_metric),
        "latest_verify_only_metric_ready": int_value(
            verify_metric.get("ready")
        ),
        "latest_verify_only_metric_blocker_count": int_value(
            verify_metric.get("blocker_count")
        ),
        "latest_verify_only_metric_max_host_uptime_seconds": int_value(
            verify_metric.get("max_host_uptime_seconds")
        ),
        "latest_verify_only_metric_last_run_timestamp": int_value(
            verify_metric.get("last_run_timestamp")
        ),
        "stockplatform_freshness_status": str(
            stock.get("freshness_status") or "unknown"
        ),
        "stockplatform_ingestion_status": str(
            stock.get("ingestion_status") or "unknown"
        ),
        "stockplatform_freshness_blocker_count": len(
            strings(stock.get("freshness_blockers"))
        ),
        "stockplatform_ingestion_blocker_count": len(
            strings(stock.get("ingestion_blockers"))
        ),
        "capacity_checked": disk.get("checked") is True,
        "capacity_free_gib": disk.get("free_gib"),
        "capacity_min_free_gib": disk.get("min_free_gib"),
        "capacity_below_minimum": "local_disk_free_below_minimum" in blockers,
    }

    readback = {
        "workplan_id": "P0-006",
        "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
        "source_id": "reboot_auto_recovery_slo_scorecard",
        "status": status_text,
        "target_minutes": int_value(payload.get("target_minutes")),
        "safe_next_step": next_step,
        "active_blockers": blockers,
        "active_blocker_count": blocker_count,
        "readiness_percent": readiness_pct,
        "blocked_by_fresh_reboot_window_only": only_fresh_window_blocks,
        "required_checks": required_checks,
        "source_controls_present": controls_ok,
        # The authorization flags below are hard-coded to False.
        "runtime_write_authorized_by_this_scorecard": False,
        "host_reboot_authorized_by_this_scorecard": False,
        "workflow_trigger_authorized_by_this_scorecard": False,
        "secret_value_collection_allowed": False,
    }

    summary = {
        "reboot_auto_recovery_status": status_text,
        "reboot_auto_recovery_workplan_id": "P0-006",
        "reboot_auto_recovery_readiness_percent": readiness_pct,
        "reboot_auto_recovery_active_blocker_count": blocker_count,
        "reboot_auto_recovery_can_claim_slo": slo_claimable,
        "reboot_auto_recovery_service_green": rollups["service_green"],
        "reboot_auto_recovery_product_data_green": rollups[
            "product_data_green"
        ],
        "reboot_auto_recovery_backup_core_green": rollups["backup_core_green"],
        "reboot_auto_recovery_host_188_service_green": rollups[
            "host_188_service_green"
        ],
        "reboot_auto_recovery_observed_host_count": rollups[
            "observed_host_count"
        ],
        "reboot_auto_recovery_stale_host_count": rollups["stale_host_count"],
        "reboot_auto_recovery_stockplatform_freshness_status": rollups[
            "stockplatform_freshness_status"
        ],
        "reboot_auto_recovery_stockplatform_ingestion_status": rollups[
            "stockplatform_ingestion_status"
        ],
        "reboot_auto_recovery_safe_next_step": readback["safe_next_step"],
        "reboot_auto_recovery_source_controls_present": controls_ok,
        "secret_values_collected": False,
        "github_api_used": False,
        "workflow_trigger_performed": False,
        "runtime_write_authorized": False,
    }

    payload.update(
        required_checks=required_checks,
        readback=readback,
        rollups=rollups,
        summary=summary,
    )
    return payload
|
||||
|
||||
|
||||
def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
|
||||
target_seconds = args.target_minutes * 60
|
||||
generated_at = args.generated_at or datetime.now().astimezone().isoformat(timespec="seconds")
|
||||
@@ -695,7 +895,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
|
||||
stockplatform=stockplatform,
|
||||
host_pressure=host_pressure,
|
||||
)
|
||||
return {
|
||||
payload = {
|
||||
"schema_version": SCHEMA_VERSION,
|
||||
"generated_at": generated_at,
|
||||
"target_minutes": args.target_minutes,
|
||||
@@ -772,6 +972,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"active_blockers": unique_blockers,
|
||||
"safe_next_step": safe_next_step,
|
||||
}
|
||||
return enrich_machine_readback(payload)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
|
||||
@@ -169,6 +169,15 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -
|
||||
assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1"
|
||||
assert payload["status"] == "slo_ready"
|
||||
assert payload["can_claim_all_services_recovered_within_target"] is True
|
||||
assert payload["readback"]["workplan_id"] == "P0-006"
|
||||
assert payload["readback"]["readiness_percent"] == 100
|
||||
assert payload["readback"]["active_blocker_count"] == 0
|
||||
assert payload["readback"]["runtime_write_authorized_by_this_scorecard"] is False
|
||||
assert payload["rollups"]["source_controls_present"] is True
|
||||
assert payload["rollups"]["readiness_percent"] == 100
|
||||
assert payload["summary"]["reboot_auto_recovery_workplan_id"] == "P0-006"
|
||||
assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is True
|
||||
assert payload["summary"]["runtime_write_authorized"] is False
|
||||
assert payload["source_controls"][
|
||||
"host_110_startup_controlled_drain_guarded_autostart_source_present"
|
||||
] is True
|
||||
@@ -186,6 +195,11 @@ def test_missing_probe_fails_closed(tmp_path: Path) -> None:
|
||||
assert payload["can_claim_all_services_recovered_within_target"] is False
|
||||
assert "all_host_reboot_detection_missing" in payload["active_blockers"]
|
||||
assert "host_boot_probe_missing_hosts" in payload["active_blockers"]
|
||||
assert payload["readback"]["active_blocker_count"] == len(
|
||||
payload["active_blockers"]
|
||||
)
|
||||
assert payload["rollups"]["readiness_percent"] < 100
|
||||
assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is False
|
||||
|
||||
|
||||
def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None:
|
||||
|
||||
Reference in New Issue
Block a user