fix(reboot): expose fixed recovery sop progress
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 51s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
AI 技術雷達監控 / ai-technology-watch (push) Successful in 38s

This commit is contained in:
Your Name
2026-07-02 13:54:34 +08:00
parent ef79edd872
commit f9469bcc21
9 changed files with 399 additions and 5 deletions

View File

@@ -635,6 +635,131 @@ def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
}
def reboot_sop_current_phase(active_blockers: list[str], can_claim: bool) -> str:
    """Map the active blocker list onto one reboot-SOP phase label.

    The checks run in a fixed order and the first one that matches decides
    the result:

    1. No blockers and ``can_claim`` is truthy -> ``"slo_ready"``.
    2. The stale boot observation blocker is the *only* blocker ->
       ``"awaiting_next_reboot_or_approved_drill"``.
    3. Any host-boot detection blocker -> ``"host_boot_detection_blocked"``.
    4. Any post-reboot service blocker ->
       ``"post_reboot_service_readiness_blocked"``.
    5. Any blocker mentioning ``stockplatform`` or ``product_data`` ->
       ``"product_data_freshness_blocked"``.
    6. Any blocker mentioning ``backup`` -> ``"backup_readback_blocked"``.
    7. ``local_disk_free_below_minimum`` -> ``"host_capacity_blocked"``.
    8. Any blocker prefixed ``host_`` or ``awooop_`` ->
       ``"host_pressure_blocked"``.
    9. Anything else -> ``"slo_blocked"``.

    Args:
        active_blockers: Blocker identifiers currently reported.
        can_claim: Whether the recovery SLO may be claimed.

    Returns:
        The phase label selected by the first matching rule.
    """
    if not active_blockers and can_claim:
        return "slo_ready"

    stale_boot_window = "host_boot_observation_older_than_target_window"
    if active_blockers == [stale_boot_window]:
        return "awaiting_next_reboot_or_approved_drill"

    host_boot_group = frozenset(
        (
            "all_host_reboot_detection_missing",
            "stateful_reboot_event_detection_missing",
            "host_boot_probe_missing_hosts",
            "host_unreachable_after_reboot",
            "host_boot_observation_older_than_target_window",
            "host_uptime_unknown",
            "reboot_event_missing_required_hosts",
            "reboot_event_required_host_unreachable",
            "fresh_all_host_reboot_event_missing",
            "all_required_hosts_not_in_10_minute_reboot_window",
        )
    )
    service_group = frozenset(
        (
            "post_reboot_summary_missing",
            "post_start_blocked_not_zero",
            "service_green_not_1",
            "host_188_service_green_not_1",
            "wazuh_dashboard_degraded",
        )
    )

    # Ordered (predicate, phase) pairs. Order matters: for example
    # "host_188_service_green_not_1" also carries the "host_" prefix, but it is
    # claimed by the service rule long before the prefix rule is consulted.
    ordered_rules = (
        (lambda name: name in host_boot_group, "host_boot_detection_blocked"),
        (
            lambda name: name in service_group,
            "post_reboot_service_readiness_blocked",
        ),
        (
            lambda name: "stockplatform" in name or "product_data" in name,
            "product_data_freshness_blocked",
        ),
        (lambda name: "backup" in name, "backup_readback_blocked"),
        (
            lambda name: name == "local_disk_free_below_minimum",
            "host_capacity_blocked",
        ),
        (
            lambda name: name.startswith(("host_", "awooop_")),
            "host_pressure_blocked",
        ),
    )
    for matches, phase in ordered_rules:
        if any(matches(name) for name in active_blockers):
            return phase
    return "slo_blocked"
def reboot_sop_primary_blocker(active_blockers: list[str]) -> str:
    """Pick the single blocker to report as the primary one.

    A fixed ranking is walked from most to least important and the first
    ranked blocker that is currently active wins. When none of the active
    blockers is ranked, the first active blocker is returned as-is; when
    there are no active blockers at all, the result is an empty string.

    NOTE(review): ``reboot_event_missing_required_hosts`` and
    ``post_reboot_summary_missing`` are recognised by
    ``reboot_sop_current_phase`` but are not ranked here, so they can only win
    through the first-element fallback -- confirm that is intended.

    Args:
        active_blockers: Blocker identifiers currently reported.

    Returns:
        The chosen blocker identifier, or ``""`` if nothing is blocking.
    """
    ranked_blockers = (
        "reboot_event_required_host_unreachable",
        "host_unreachable_after_reboot",
        "all_host_reboot_detection_missing",
        "stateful_reboot_event_detection_missing",
        "host_boot_probe_missing_hosts",
        "fresh_all_host_reboot_event_missing",
        "all_required_hosts_not_in_10_minute_reboot_window",
        "host_boot_observation_older_than_target_window",
        "host_uptime_unknown",
        "post_start_blocked_not_zero",
        "service_green_not_1",
        "host_188_service_green_not_1",
        "product_data_green_not_1",
        "stockplatform_freshness_blocked",
        "stockplatform_ingestion_blocked",
        "backup_core_green_not_1",
        "local_disk_free_below_minimum",
        "wazuh_dashboard_degraded",
    )
    top_ranked = next(
        (name for name in ranked_blockers if name in active_blockers),
        None,
    )
    if top_ranked is not None:
        return top_ranked
    if not active_blockers:
        return ""
    # Nothing ranked is active: fall back to the caller's own ordering.
    return active_blockers[0]
def reboot_sop_eta_or_wait_reason(
    payload: dict[str, Any],
    active_blockers: list[str],
    current_phase: str,
    primary_blocker: str,
) -> str:
    """Describe the recovery ETA, or why no ETA can be given.

    Decision order (first match wins):

    1. Phase is ``"slo_ready"`` -> recovered within the SLO.
    2. ``sla_recovery_eta.target_seconds_remaining`` is positive -> report the
       remaining seconds together with the primary blocker (``"unknown"``
       when the blocker is empty).
    3. ``reboot_event_detection.readback_present`` is not exactly ``True`` ->
       readback missing.
    4. ``sla_recovery_eta.deadline_status`` equals
       ``"target_window_elapsed"`` -> target window elapsed.
    5. There are active blockers -> waiting on the primary blocker.
    6. Otherwise -> ``"eta_unavailable"``.

    Args:
        payload: Scorecard payload; the ``sla_recovery_eta`` and
            ``reboot_event_detection`` entries are read and treated as empty
            when they are not dicts.
        active_blockers: Blocker identifiers currently reported.
        current_phase: Phase label previously computed for this payload.
        primary_blocker: Blocker identifier selected as primary.

    Returns:
        A machine-readable reason string.
    """
    if current_phase == "slo_ready":
        return "recovered_within_10_minute_slo"

    eta_section = payload.get("sla_recovery_eta")
    eta_section = eta_section if isinstance(eta_section, dict) else {}
    event_section = payload.get("reboot_event_detection")
    event_section = event_section if isinstance(event_section, dict) else {}

    # int_value is a helper defined elsewhere in this module; the second
    # argument is presumably its fallback for unparsable input.
    seconds_left = int_value(eta_section.get("target_seconds_remaining"), 0)
    window_state = str(eta_section.get("deadline_status") or "unknown")

    if seconds_left > 0:
        blocker_label = primary_blocker or "unknown"
        return f"target_window_remaining_{seconds_left}s_but_blocked_by_{blocker_label}"

    readback_confirmed = event_section.get("readback_present") is True
    if not readback_confirmed:
        return "reboot_event_readback_missing_eta_unavailable"

    if window_state == "target_window_elapsed":
        return (
            "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_"
            "event_and_probe"
        )

    if not active_blockers:
        return "eta_unavailable"
    return f"eta_unavailable_until_primary_blocker_clears:{primary_blocker}"
def build_reboot_sop_progress(
    payload: dict[str, Any],
    active_blockers: list[str],
    readiness_percent: int,
) -> dict[str, Any]:
    """Assemble the reboot-SOP progress record for a scorecard payload.

    The phase, primary blocker and ETA/wait reason are derived through
    ``reboot_sop_current_phase``, ``reboot_sop_primary_blocker`` and
    ``reboot_sop_eta_or_wait_reason``; the remaining fields are copied from
    the arguments or read from ``payload``.

    Args:
        payload: Scorecard payload. Reads
            ``can_claim_all_services_recovered_within_target`` (only an exact
            ``True`` counts), ``safe_next_step`` and
            ``sla_recovery_eta.fixed_triage_order``.
        active_blockers: Blocker identifiers currently reported; the same
            list object is placed in the result.
        readiness_percent: Readiness percentage, passed through unchanged.

    Returns:
        A dict with the keys ``current_phase``, ``eta_or_wait_reason``,
        ``primary_blocker``, ``active_blockers``, ``active_blocker_count``,
        ``readiness_percent``, ``next_safe_action`` and ``fixed_triage_order``.
    """
    slo_claimable = (
        payload.get("can_claim_all_services_recovered_within_target") is True
    )
    phase = reboot_sop_current_phase(active_blockers, slo_claimable)
    top_blocker = reboot_sop_primary_blocker(active_blockers)
    wait_reason = reboot_sop_eta_or_wait_reason(
        payload, active_blockers, phase, top_blocker
    )

    # A non-dict ETA section is treated as empty, so the triage order lookup
    # then yields None.
    eta_section = payload.get("sla_recovery_eta")
    triage_order_raw = (
        eta_section.get("fixed_triage_order")
        if isinstance(eta_section, dict)
        else None
    )

    return {
        "current_phase": phase,
        "eta_or_wait_reason": wait_reason,
        "primary_blocker": top_blocker,
        "active_blockers": active_blockers,
        "active_blocker_count": len(active_blockers),
        "readiness_percent": readiness_percent,
        "next_safe_action": str(payload.get("safe_next_step") or ""),
        "fixed_triage_order": strings(triage_order_raw),
    }
def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
"""Add machine-readable P0-006 readback fields to the source scorecard."""
active_blockers = strings(payload.get("active_blockers"))
@@ -671,6 +796,7 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
blocked_by_fresh_reboot_window_only = active_blockers == [
"host_boot_observation_older_than_target_window"
]
sop_progress = build_reboot_sop_progress(payload, active_blockers, readiness_percent)
source_control_ready_count = sum(1 for value in controls.values() if value)
source_controls_present = (
bool(controls) and source_control_ready_count == len(controls)
@@ -738,8 +864,12 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
"workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
"source_id": "reboot_auto_recovery_slo_scorecard",
"status": str(payload.get("status") or "unknown"),
"current_phase": sop_progress["current_phase"],
"eta_or_wait_reason": sop_progress["eta_or_wait_reason"],
"target_minutes": int_value(payload.get("target_minutes")),
"safe_next_step": str(payload.get("safe_next_step") or ""),
"next_safe_action": sop_progress["next_safe_action"],
"primary_blocker": sop_progress["primary_blocker"],
"active_blockers": active_blockers,
"active_blocker_count": len(active_blockers),
"readiness_percent": readiness_percent,
@@ -755,6 +885,11 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
summary = {
"reboot_auto_recovery_status": str(payload.get("status") or "unknown"),
"reboot_auto_recovery_workplan_id": "P0-006",
"reboot_auto_recovery_current_phase": sop_progress["current_phase"],
"reboot_auto_recovery_eta_or_wait_reason": sop_progress[
"eta_or_wait_reason"
],
"reboot_auto_recovery_primary_blocker": sop_progress["primary_blocker"],
"reboot_auto_recovery_readiness_percent": readiness_percent,
"reboot_auto_recovery_active_blocker_count": len(active_blockers),
"reboot_auto_recovery_can_claim_slo": (
@@ -775,6 +910,7 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
"stockplatform_ingestion_status"
],
"reboot_auto_recovery_safe_next_step": readback["safe_next_step"],
"reboot_auto_recovery_next_safe_action": readback["next_safe_action"],
"reboot_auto_recovery_source_controls_present": source_controls_present,
"secret_values_collected": False,
"github_api_used": False,
@@ -783,6 +919,11 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
}
payload["required_checks"] = required_checks
payload["current_phase"] = sop_progress["current_phase"]
payload["eta_or_wait_reason"] = sop_progress["eta_or_wait_reason"]
payload["primary_blocker"] = sop_progress["primary_blocker"]
payload["next_safe_action"] = sop_progress["next_safe_action"]
payload["reboot_sop_progress"] = sop_progress
payload["readback"] = readback
payload["rollups"] = rollups
payload["summary"] = summary

View File

@@ -169,13 +169,27 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -
assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1"
assert payload["status"] == "slo_ready"
assert payload["can_claim_all_services_recovered_within_target"] is True
assert payload["current_phase"] == "slo_ready"
assert payload["eta_or_wait_reason"] == "recovered_within_10_minute_slo"
assert payload["primary_blocker"] == ""
assert payload["next_safe_action"] == payload["safe_next_step"]
assert payload["reboot_sop_progress"]["current_phase"] == "slo_ready"
assert payload["reboot_sop_progress"]["active_blocker_count"] == 0
assert payload["readback"]["workplan_id"] == "P0-006"
assert payload["readback"]["current_phase"] == "slo_ready"
assert payload["readback"]["eta_or_wait_reason"] == "recovered_within_10_minute_slo"
assert payload["readback"]["next_safe_action"] == payload["safe_next_step"]
assert payload["readback"]["readiness_percent"] == 100
assert payload["readback"]["active_blocker_count"] == 0
assert payload["readback"]["runtime_write_authorized_by_this_scorecard"] is False
assert payload["rollups"]["source_controls_present"] is True
assert payload["rollups"]["readiness_percent"] == 100
assert payload["summary"]["reboot_auto_recovery_workplan_id"] == "P0-006"
assert payload["summary"]["reboot_auto_recovery_current_phase"] == "slo_ready"
assert (
payload["summary"]["reboot_auto_recovery_eta_or_wait_reason"]
== "recovered_within_10_minute_slo"
)
assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is True
assert payload["summary"]["runtime_write_authorized"] is False
assert payload["source_controls"][
@@ -221,10 +235,17 @@ def test_services_green_but_old_boot_window_waits_for_reboot_event(tmp_path: Pat
assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready"
assert payload["active_blockers"] == ["host_boot_observation_older_than_target_window"]
assert payload["current_phase"] == "awaiting_next_reboot_or_approved_drill"
assert (
payload["eta_or_wait_reason"]
== "target_window_remaining_450s_but_blocked_by_host_boot_observation_older_than_target_window"
)
assert payload["primary_blocker"] == "host_boot_observation_older_than_target_window"
assert payload["safe_next_step"] == (
"timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_"
"event_or_approved_reboot_drill_to_prove_10_minute_slo"
)
assert payload["next_safe_action"] == payload["safe_next_step"]
def test_stockplatform_blocked_before_final_retry_waits_for_readback(tmp_path: Path) -> None: