From 4fe0b903ff02676997fee7a6053dba3d313d3566 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 3 Jul 2026 03:14:54 +0800 Subject: [PATCH] fix(reboot): preserve service backup metric blockers --- .../reboot_auto_recovery_slo_scorecard.py | 55 ++++++++++++++++++ ..._reboot_auto_recovery_slo_scorecard_api.py | 56 ++++++++++++++++--- 2 files changed, 102 insertions(+), 9 deletions(-) diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 7f699670e..f2eec9c8e 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -68,6 +68,14 @@ _RUNTIME_BOOLEAN_READY_BLOCKERS = { "product_data_green_not_1": "product_data_green", "service_green_not_1": "service_green", } +_SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS = { + "backup_core_green": "backup_core_green_not_1", + "host_188_service_green": "host_188_service_green_not_1", + "post_start_blocked": "post_start_blocked_not_zero", + "product_data_green": "product_data_green_not_1", + "service_green": "service_green_not_1", + "wazuh_dashboard_degraded": "wazuh_dashboard_degraded", +} _PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS = { "conversation_event_hot_path_index_migration_source_missing": ( "conversation_event_hot_path_index_migration_source_present" @@ -421,6 +429,8 @@ def _annotate_prometheus_metric_readback( payload["runtime_metric_source_control_reconciled_blocker_count"] = 0 payload["runtime_metric_runtime_readback_reconciled_blockers"] = [] payload["runtime_metric_runtime_readback_reconciled_blocker_count"] = 0 + payload["runtime_metric_runtime_readback_added_blockers"] = [] + payload["runtime_metric_runtime_readback_added_blocker_count"] = 0 readback = _dict(payload.setdefault("readback", {})) readback["runtime_metric_readback_present"] = present @@ -435,6 +445,8 @@ def _annotate_prometheus_metric_readback( readback["runtime_metric_source_control_reconciled_blocker_count"] = 0 readback["runtime_metric_runtime_readback_reconciled_blockers"] = [] readback["runtime_metric_runtime_readback_reconciled_blocker_count"] = 0 + readback["runtime_metric_runtime_readback_added_blockers"] = [] + readback["runtime_metric_runtime_readback_added_blocker_count"] = 0 rollups = _dict(payload.setdefault("rollups", {})) rollups["runtime_metric_readback_present"] = present @@ -446,6 +458,7 @@ def _annotate_prometheus_metric_readback( ] rollups["runtime_metric_source_control_reconciled_blocker_count"] = 0 rollups["runtime_metric_runtime_readback_reconciled_blocker_count"] = 0 + rollups["runtime_metric_runtime_readback_added_blocker_count"] = 0 _apply_prometheus_windows99_vmware_readback(payload, metric_readback) @@ -634,6 +647,44 @@ def _reconcile_prometheus_metric_active_blockers_with_runtime_readbacks( return _unique_strings(reconciled) +def _add_runtime_readback_active_blockers_missing_from_metric( + payload: dict[str, Any], + active_blockers: list[str], +) -> list[str]: + service_backup = _dict(payload.get("controlled_service_data_backup_readback")) + blocking_fields = _strings(service_backup.get("blocking_fields")) + added = _unique_strings( + [ + _SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS[field] + for field in blocking_fields + if field in _SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS + ] + ) + if not added: + return _unique_strings(active_blockers) + + merged = _unique_strings([*active_blockers, *added]) + actually_added = [blocker for blocker in added if blocker not in active_blockers] + if actually_added: + payload["runtime_metric_runtime_readback_added_blockers"] = actually_added + payload["runtime_metric_runtime_readback_added_blocker_count"] = len( + actually_added + ) + + readback = _dict(payload.setdefault("readback", {})) + readback["runtime_metric_runtime_readback_added_blockers"] = actually_added + readback["runtime_metric_runtime_readback_added_blocker_count"] = len( + actually_added + ) + + rollups = _dict(payload.setdefault("rollups", {})) + rollups["runtime_metric_runtime_readback_added_blocker_count"] = len( + actually_added + ) + + return merged + + def _apply_prometheus_metric_active_blockers( payload: dict[str, Any], metric_readback: dict[str, Any], @@ -651,6 +702,10 @@ def _apply_prometheus_metric_active_blockers( payload, active_blockers, ) + active_blockers = _add_runtime_readback_active_blockers_missing_from_metric( + payload, + active_blockers, + ) can_claim_slo = metric_readback.get("ready") is True and not active_blockers primary_blocker = str( diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 316c738ac..93ca28811 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -331,6 +331,16 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics( payload = load_latest_reboot_auto_recovery_slo_scorecard( prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK ) + expected_runtime_readback_added_blockers = [ + "service_green_not_1", + "post_start_blocked_not_zero", + "backup_core_green_not_1", + "wazuh_dashboard_degraded", + ] + expected_active_blockers = [ + *PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS, + *expected_runtime_readback_added_blockers, + ] assert payload["runtime_scorecard_readback_present"] is False assert payload["runtime_metric_readback_present"] is True @@ -341,17 +351,21 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics( == [PROMETHEUS_SOURCE_RECONCILED_BLOCKER] ) assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 1 - assert payload["active_blockers"] == PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS - assert payload["active_blocker_count"] == 7 + assert payload["active_blockers"] == expected_active_blockers + assert payload["active_blocker_count"] == 11 assert payload["readiness_percent"] == 47 assert payload["primary_blocker"] == "reboot_event_required_host_unreachable" assert payload["next_safe_action"] == ( "rerun_reboot_event_detector_and_host_probe_verify_only_no_reboot" ) - assert "backup_core_green_not_1" not in payload["active_blockers"] - assert "service_green_not_1" not in payload["active_blockers"] + assert "backup_core_green_not_1" in payload["active_blockers"] + assert "service_green_not_1" in payload["active_blockers"] assert PROMETHEUS_SOURCE_RECONCILED_BLOCKER not in payload["active_blockers"] - assert payload["active_blocker_action_matrix"]["item_count"] == 7 + assert payload["runtime_metric_runtime_readback_added_blockers"] == ( + expected_runtime_readback_added_blockers + ) + assert payload["runtime_metric_runtime_readback_added_blocker_count"] == 4 + assert payload["active_blocker_action_matrix"]["item_count"] == 11 assert payload["windows99_vmware_autostart"]["readback_present"] is True assert payload["windows99_vmware_autostart"]["missing_vmx_aliases"] == ["111"] assert payload["windows99_vmware_autostart"]["powered_off_aliases"] == [ @@ -401,16 +415,34 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics( "host_cpu_pressure", 0, ) == 0 - assert payload["readback"]["active_blocker_count"] == 7 + assert payload["active_blocker_action_matrix"]["category_counts"][ + "post_reboot_service_readiness" + ] == 2 + assert payload["active_blocker_action_matrix"]["category_counts"][ + "backup_observability" + ] == 1 + assert payload["active_blocker_action_matrix"]["category_counts"][ + "security_observability" + ] == 1 + assert payload["readback"]["active_blocker_count"] == 11 assert payload["readback"]["runtime_metric_readback_present"] is True assert payload["readback"][ "runtime_metric_source_control_reconciled_blocker_count" ] == 1 - assert payload["rollups"]["active_blocker_count"] == 7 + assert payload["readback"][ + "runtime_metric_runtime_readback_added_blockers" + ] == expected_runtime_readback_added_blockers + assert payload["readback"][ + "runtime_metric_runtime_readback_added_blocker_count" + ] == 4 + assert payload["rollups"]["active_blocker_count"] == 11 assert payload["rollups"]["runtime_metric_readback_present"] is True assert payload["rollups"][ "runtime_metric_source_control_reconciled_blocker_count" ] == 1 + assert payload["rollups"][ + "runtime_metric_runtime_readback_added_blocker_count" + ] == 4 assert payload["rollups"]["primary_blocker_owner_lane"] == ( "reboot_event_detector_and_host_probe" ) @@ -433,8 +465,14 @@ def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK, ) - assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS - assert payload["active_blocker_count"] == 8 + assert payload["active_blockers"] == [ + *PROMETHEUS_RUNTIME_BLOCKERS, + "service_green_not_1", + "post_start_blocked_not_zero", + "backup_core_green_not_1", + "wazuh_dashboard_degraded", + ] + assert payload["active_blocker_count"] == 12 assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 0 action_by_blocker = { item["blocker"]: item