fix(reboot): expose fixed recovery sop progress
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 51s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
AI 技術雷達監控 / ai-technology-watch (push) Successful in 38s
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 51s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
AI 技術雷達監控 / ai-technology-watch (push) Successful in 38s
This commit is contained in:
@@ -635,6 +635,131 @@ def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
|
||||
}
|
||||
|
||||
|
||||
def reboot_sop_current_phase(active_blockers: list[str], can_claim: bool) -> str:
    """Classify the reboot recovery SOP into a single phase label.

    The checks run in a fixed order and the first match wins, so a blocker
    list that spans several categories is reported under the earliest one.

    Args:
        active_blockers: Blocker identifiers that are currently active.
        can_claim: Whether the recovery SLO may be claimed.

    Returns:
        One of ``"slo_ready"``, ``"awaiting_next_reboot_or_approved_drill"``,
        ``"host_boot_detection_blocked"``,
        ``"post_reboot_service_readiness_blocked"``,
        ``"product_data_freshness_blocked"``, ``"backup_readback_blocked"``,
        ``"host_capacity_blocked"``, ``"host_pressure_blocked"`` or the
        catch-all ``"slo_blocked"``.
    """
    if can_claim and not active_blockers:
        return "slo_ready"

    # A stale boot observation as the *only* blocker gets its own phase;
    # combined with any other blocker it falls into the host-boot group below.
    if active_blockers == ["host_boot_observation_older_than_target_window"]:
        return "awaiting_next_reboot_or_approved_drill"

    host_boot_blockers = frozenset(
        {
            "all_host_reboot_detection_missing",
            "stateful_reboot_event_detection_missing",
            "host_boot_probe_missing_hosts",
            "host_unreachable_after_reboot",
            "host_boot_observation_older_than_target_window",
            "host_uptime_unknown",
            "reboot_event_missing_required_hosts",
            "reboot_event_required_host_unreachable",
            "fresh_all_host_reboot_event_missing",
            "all_required_hosts_not_in_10_minute_reboot_window",
        }
    )
    if any(blocker in host_boot_blockers for blocker in active_blockers):
        return "host_boot_detection_blocked"

    # "host_188_service_green_not_1" is matched here, ahead of the generic
    # "host_" prefix check at the end of the function.
    service_blockers = frozenset(
        {
            "post_reboot_summary_missing",
            "post_start_blocked_not_zero",
            "service_green_not_1",
            "host_188_service_green_not_1",
            "wazuh_dashboard_degraded",
        }
    )
    if any(blocker in service_blockers for blocker in active_blockers):
        return "post_reboot_service_readiness_blocked"

    # The remaining categories are matched by substring / prefix rather than
    # by an exact identifier.
    if any(
        "stockplatform" in blocker or "product_data" in blocker
        for blocker in active_blockers
    ):
        return "product_data_freshness_blocked"
    if any("backup" in blocker for blocker in active_blockers):
        return "backup_readback_blocked"
    if "local_disk_free_below_minimum" in active_blockers:
        return "host_capacity_blocked"
    if any(blocker.startswith(("host_", "awooop_")) for blocker in active_blockers):
        return "host_pressure_blocked"

    # Also reached when there are no blockers but the SLO cannot be claimed.
    return "slo_blocked"
|
||||
|
||||
|
||||
def reboot_sop_primary_blocker(active_blockers: list[str]) -> str:
    """Pick the single blocker to surface as the primary one.

    Known blockers are ranked by a fixed priority order; the highest-ranked
    one that is active wins. When none of the active blockers is ranked, the
    first active blocker is returned as-is.

    Args:
        active_blockers: Blocker identifiers that are currently active.

    Returns:
        The chosen blocker identifier, or ``""`` when there are no blockers.
    """
    # NOTE(review): "post_reboot_summary_missing" is treated as a service
    # blocker by reboot_sop_current_phase but is not ranked here, so it is
    # only reported when no ranked blocker is active - confirm this is
    # intended.
    priority = (
        "reboot_event_required_host_unreachable",
        "host_unreachable_after_reboot",
        "all_host_reboot_detection_missing",
        "stateful_reboot_event_detection_missing",
        "host_boot_probe_missing_hosts",
        "fresh_all_host_reboot_event_missing",
        "all_required_hosts_not_in_10_minute_reboot_window",
        "host_boot_observation_older_than_target_window",
        "host_uptime_unknown",
        "post_start_blocked_not_zero",
        "service_green_not_1",
        "host_188_service_green_not_1",
        "product_data_green_not_1",
        "stockplatform_freshness_blocked",
        "stockplatform_ingestion_blocked",
        "backup_core_green_not_1",
        "local_disk_free_below_minimum",
        "wazuh_dashboard_degraded",
    )
    # Build the lookup once instead of scanning the list per priority entry.
    active = set(active_blockers)
    for blocker in priority:
        if blocker in active:
            return blocker
    return active_blockers[0] if active_blockers else ""
|
||||
|
||||
|
||||
def reboot_sop_eta_or_wait_reason(
    payload: dict[str, Any],
    active_blockers: list[str],
    current_phase: str,
    primary_blocker: str,
) -> str:
    """Build the machine-readable ETA / wait-reason token for the SOP.

    Args:
        payload: Scorecard payload. The ``"sla_recovery_eta"`` and
            ``"reboot_event_detection"`` entries are read; either one is
            treated as empty when it is not a dict.
        active_blockers: Blocker identifiers that are currently active.
        current_phase: Phase label; ``"slo_ready"`` short-circuits everything
            else.
        primary_blocker: Blocker to name in the message; an empty value is
            rendered as ``"unknown"``.

    Returns:
        A single token string describing either the recovered state, the
        remaining target window, or why no ETA is available.
    """
    if current_phase == "slo_ready":
        return "recovered_within_10_minute_slo"

    sla_eta = payload.get("sla_recovery_eta")
    if not isinstance(sla_eta, dict):
        sla_eta = {}
    reboot_event = payload.get("reboot_event_detection")
    if not isinstance(reboot_event, dict):
        reboot_event = {}

    # int_value is a helper defined elsewhere in this module; presumably it
    # coerces to int and falls back to the given default - confirm.
    remaining = int_value(sla_eta.get("target_seconds_remaining"), 0)
    deadline_status = str(sla_eta.get("deadline_status") or "unknown")
    # Use one fallback for every message that names the blocker, so no token
    # ends with an empty blocker suffix.
    blocker_label = primary_blocker or "unknown"

    if remaining > 0:
        return f"target_window_remaining_{remaining}s_but_blocked_by_{blocker_label}"
    # Only an explicit True counts as a present readback.
    if reboot_event.get("readback_present") is not True:
        return "reboot_event_readback_missing_eta_unavailable"
    if deadline_status == "target_window_elapsed":
        return (
            "target_window_elapsed_eta_unavailable_until_fresh_all_host_reboot_"
            "event_and_probe"
        )
    if active_blockers:
        return f"eta_unavailable_until_primary_blocker_clears:{blocker_label}"
    return "eta_unavailable"
|
||||
|
||||
|
||||
def build_reboot_sop_progress(
    payload: dict[str, Any],
    active_blockers: list[str],
    readiness_percent: int,
) -> dict[str, Any]:
    """Assemble the reboot SOP progress record from the scorecard payload.

    Args:
        payload: Scorecard payload. Reads
            ``"can_claim_all_services_recovered_within_target"``,
            ``"safe_next_step"`` and ``"sla_recovery_eta"``.
        active_blockers: Blocker identifiers that are currently active; the
            same list object is placed in the result.
        readiness_percent: Readiness value, passed through unchanged.

    Returns:
        A dict with the derived phase, wait reason and primary blocker, plus
        the blocker list, its length, the readiness value, the next safe
        action and the fixed triage order.
    """
    # Only an explicit True allows the SLO to be claimed.
    claimable = payload.get("can_claim_all_services_recovered_within_target") is True
    phase = reboot_sop_current_phase(active_blockers, claimable)
    blocker = reboot_sop_primary_blocker(active_blockers)
    wait_reason = reboot_sop_eta_or_wait_reason(
        payload, active_blockers, phase, blocker
    )

    # A missing or non-dict ETA section yields no triage order.
    eta_section = payload.get("sla_recovery_eta")
    triage_source = (
        eta_section.get("fixed_triage_order")
        if isinstance(eta_section, dict)
        else None
    )

    progress: dict[str, Any] = {
        "current_phase": phase,
        "eta_or_wait_reason": wait_reason,
        "primary_blocker": blocker,
        "active_blockers": active_blockers,
        "active_blocker_count": len(active_blockers),
        "readiness_percent": readiness_percent,
        "next_safe_action": str(payload.get("safe_next_step") or ""),
        "fixed_triage_order": strings(triage_source),
    }
    return progress
|
||||
|
||||
|
||||
def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Add machine-readable P0-006 readback fields to the source scorecard."""
|
||||
active_blockers = strings(payload.get("active_blockers"))
|
||||
@@ -671,6 +796,7 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
|
||||
blocked_by_fresh_reboot_window_only = active_blockers == [
|
||||
"host_boot_observation_older_than_target_window"
|
||||
]
|
||||
sop_progress = build_reboot_sop_progress(payload, active_blockers, readiness_percent)
|
||||
source_control_ready_count = sum(1 for value in controls.values() if value)
|
||||
source_controls_present = (
|
||||
bool(controls) and source_control_ready_count == len(controls)
|
||||
@@ -738,8 +864,12 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
|
||||
"workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
|
||||
"source_id": "reboot_auto_recovery_slo_scorecard",
|
||||
"status": str(payload.get("status") or "unknown"),
|
||||
"current_phase": sop_progress["current_phase"],
|
||||
"eta_or_wait_reason": sop_progress["eta_or_wait_reason"],
|
||||
"target_minutes": int_value(payload.get("target_minutes")),
|
||||
"safe_next_step": str(payload.get("safe_next_step") or ""),
|
||||
"next_safe_action": sop_progress["next_safe_action"],
|
||||
"primary_blocker": sop_progress["primary_blocker"],
|
||||
"active_blockers": active_blockers,
|
||||
"active_blocker_count": len(active_blockers),
|
||||
"readiness_percent": readiness_percent,
|
||||
@@ -755,6 +885,11 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
|
||||
summary = {
|
||||
"reboot_auto_recovery_status": str(payload.get("status") or "unknown"),
|
||||
"reboot_auto_recovery_workplan_id": "P0-006",
|
||||
"reboot_auto_recovery_current_phase": sop_progress["current_phase"],
|
||||
"reboot_auto_recovery_eta_or_wait_reason": sop_progress[
|
||||
"eta_or_wait_reason"
|
||||
],
|
||||
"reboot_auto_recovery_primary_blocker": sop_progress["primary_blocker"],
|
||||
"reboot_auto_recovery_readiness_percent": readiness_percent,
|
||||
"reboot_auto_recovery_active_blocker_count": len(active_blockers),
|
||||
"reboot_auto_recovery_can_claim_slo": (
|
||||
@@ -775,6 +910,7 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
|
||||
"stockplatform_ingestion_status"
|
||||
],
|
||||
"reboot_auto_recovery_safe_next_step": readback["safe_next_step"],
|
||||
"reboot_auto_recovery_next_safe_action": readback["next_safe_action"],
|
||||
"reboot_auto_recovery_source_controls_present": source_controls_present,
|
||||
"secret_values_collected": False,
|
||||
"github_api_used": False,
|
||||
@@ -783,6 +919,11 @@ def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
payload["required_checks"] = required_checks
|
||||
payload["current_phase"] = sop_progress["current_phase"]
|
||||
payload["eta_or_wait_reason"] = sop_progress["eta_or_wait_reason"]
|
||||
payload["primary_blocker"] = sop_progress["primary_blocker"]
|
||||
payload["next_safe_action"] = sop_progress["next_safe_action"]
|
||||
payload["reboot_sop_progress"] = sop_progress
|
||||
payload["readback"] = readback
|
||||
payload["rollups"] = rollups
|
||||
payload["summary"] = summary
|
||||
|
||||
@@ -169,13 +169,27 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) -
|
||||
assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1"
|
||||
assert payload["status"] == "slo_ready"
|
||||
assert payload["can_claim_all_services_recovered_within_target"] is True
|
||||
assert payload["current_phase"] == "slo_ready"
|
||||
assert payload["eta_or_wait_reason"] == "recovered_within_10_minute_slo"
|
||||
assert payload["primary_blocker"] == ""
|
||||
assert payload["next_safe_action"] == payload["safe_next_step"]
|
||||
assert payload["reboot_sop_progress"]["current_phase"] == "slo_ready"
|
||||
assert payload["reboot_sop_progress"]["active_blocker_count"] == 0
|
||||
assert payload["readback"]["workplan_id"] == "P0-006"
|
||||
assert payload["readback"]["current_phase"] == "slo_ready"
|
||||
assert payload["readback"]["eta_or_wait_reason"] == "recovered_within_10_minute_slo"
|
||||
assert payload["readback"]["next_safe_action"] == payload["safe_next_step"]
|
||||
assert payload["readback"]["readiness_percent"] == 100
|
||||
assert payload["readback"]["active_blocker_count"] == 0
|
||||
assert payload["readback"]["runtime_write_authorized_by_this_scorecard"] is False
|
||||
assert payload["rollups"]["source_controls_present"] is True
|
||||
assert payload["rollups"]["readiness_percent"] == 100
|
||||
assert payload["summary"]["reboot_auto_recovery_workplan_id"] == "P0-006"
|
||||
assert payload["summary"]["reboot_auto_recovery_current_phase"] == "slo_ready"
|
||||
assert (
|
||||
payload["summary"]["reboot_auto_recovery_eta_or_wait_reason"]
|
||||
== "recovered_within_10_minute_slo"
|
||||
)
|
||||
assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is True
|
||||
assert payload["summary"]["runtime_write_authorized"] is False
|
||||
assert payload["source_controls"][
|
||||
@@ -221,10 +235,17 @@ def test_services_green_but_old_boot_window_waits_for_reboot_event(tmp_path: Pat
|
||||
|
||||
assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready"
|
||||
assert payload["active_blockers"] == ["host_boot_observation_older_than_target_window"]
|
||||
assert payload["current_phase"] == "awaiting_next_reboot_or_approved_drill"
|
||||
assert (
|
||||
payload["eta_or_wait_reason"]
|
||||
== "target_window_remaining_450s_but_blocked_by_host_boot_observation_older_than_target_window"
|
||||
)
|
||||
assert payload["primary_blocker"] == "host_boot_observation_older_than_target_window"
|
||||
assert payload["safe_next_step"] == (
|
||||
"timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_"
|
||||
"event_or_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
)
|
||||
assert payload["next_safe_action"] == payload["safe_next_step"]
|
||||
|
||||
|
||||
def test_stockplatform_blocked_before_final_retry_waits_for_readback(tmp_path: Path) -> None:
|
||||
|
||||
Reference in New Issue
Block a user