fix(reboot): preserve service backup metric blockers
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m6s
CD Pipeline / build-and-deploy (push) Successful in 5m6s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Successful in 23s
CD Pipeline / post-deploy-checks (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-03 03:14:54 +08:00
parent 2fb982e405
commit 4fe0b903ff
2 changed files with 102 additions and 9 deletions

View File

@@ -68,6 +68,14 @@ _RUNTIME_BOOLEAN_READY_BLOCKERS = {
"product_data_green_not_1": "product_data_green",
"service_green_not_1": "service_green",
}
_SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS = {
"backup_core_green": "backup_core_green_not_1",
"host_188_service_green": "host_188_service_green_not_1",
"post_start_blocked": "post_start_blocked_not_zero",
"product_data_green": "product_data_green_not_1",
"service_green": "service_green_not_1",
"wazuh_dashboard_degraded": "wazuh_dashboard_degraded",
}
_PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS = {
"conversation_event_hot_path_index_migration_source_missing": (
"conversation_event_hot_path_index_migration_source_present"
@@ -421,6 +429,8 @@ def _annotate_prometheus_metric_readback(
payload["runtime_metric_source_control_reconciled_blocker_count"] = 0
payload["runtime_metric_runtime_readback_reconciled_blockers"] = []
payload["runtime_metric_runtime_readback_reconciled_blocker_count"] = 0
payload["runtime_metric_runtime_readback_added_blockers"] = []
payload["runtime_metric_runtime_readback_added_blocker_count"] = 0
readback = _dict(payload.setdefault("readback", {}))
readback["runtime_metric_readback_present"] = present
@@ -435,6 +445,8 @@ def _annotate_prometheus_metric_readback(
readback["runtime_metric_source_control_reconciled_blocker_count"] = 0
readback["runtime_metric_runtime_readback_reconciled_blockers"] = []
readback["runtime_metric_runtime_readback_reconciled_blocker_count"] = 0
readback["runtime_metric_runtime_readback_added_blockers"] = []
readback["runtime_metric_runtime_readback_added_blocker_count"] = 0
rollups = _dict(payload.setdefault("rollups", {}))
rollups["runtime_metric_readback_present"] = present
@@ -446,6 +458,7 @@ def _annotate_prometheus_metric_readback(
]
rollups["runtime_metric_source_control_reconciled_blocker_count"] = 0
rollups["runtime_metric_runtime_readback_reconciled_blocker_count"] = 0
rollups["runtime_metric_runtime_readback_added_blocker_count"] = 0
_apply_prometheus_windows99_vmware_readback(payload, metric_readback)
@@ -634,6 +647,44 @@ def _reconcile_prometheus_metric_active_blockers_with_runtime_readbacks(
return _unique_strings(reconciled)
def _add_runtime_readback_active_blockers_missing_from_metric(
    payload: dict[str, Any],
    active_blockers: list[str],
) -> list[str]:
    """Append service/data backup readback blockers to the metric blocker list.

    The ``blocking_fields`` of ``payload["controlled_service_data_backup_readback"]``
    are translated through ``_SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS``;
    fields without a mapping are ignored. The mapped blockers are appended to
    ``active_blockers`` and the combined list is passed through
    ``_unique_strings`` before being returned.

    When at least one mapped blocker was not already in ``active_blockers``,
    the newly added blockers and their count are written to ``payload`` and to
    its ``readback`` section, and the count alone to its ``rollups`` section,
    under the ``runtime_metric_runtime_readback_added_*`` keys.

    Args:
        payload: Scorecard payload; read for the backup readback and updated
            with the added-blocker bookkeeping.
        active_blockers: Blockers derived so far from the metric readback.

    Returns:
        ``_unique_strings`` applied to ``active_blockers`` followed by the
        mapped readback blockers.
    """
    blockers_key = "runtime_metric_runtime_readback_added_blockers"
    count_key = "runtime_metric_runtime_readback_added_blocker_count"

    backup_readback = _dict(payload.get("controlled_service_data_backup_readback"))
    mapped: list[str] = []
    for field_name in _strings(backup_readback.get("blocking_fields")):
        if field_name in _SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS:
            mapped.append(_SERVICE_DATA_BACKUP_BLOCKING_FIELD_BLOCKERS[field_name])
    readback_blockers = _unique_strings(mapped)

    # Nothing reported by the backup readback: hand back the input list as is
    # (still routed through `_unique_strings`).
    if not readback_blockers:
        return _unique_strings(active_blockers)

    combined = _unique_strings([*active_blockers, *readback_blockers])

    # Only blockers the metric did not already report count as "added".
    newly_added = [
        candidate
        for candidate in readback_blockers
        if candidate not in active_blockers
    ]
    if not newly_added:
        return combined

    added_count = len(newly_added)
    payload[blockers_key] = newly_added
    payload[count_key] = added_count

    readback_section = _dict(payload.setdefault("readback", {}))
    readback_section[blockers_key] = newly_added
    readback_section[count_key] = added_count

    # Rollups carry the count only, not the blocker list.
    rollup_section = _dict(payload.setdefault("rollups", {}))
    rollup_section[count_key] = added_count

    return combined
def _apply_prometheus_metric_active_blockers(
payload: dict[str, Any],
metric_readback: dict[str, Any],
@@ -651,6 +702,10 @@ def _apply_prometheus_metric_active_blockers(
payload,
active_blockers,
)
active_blockers = _add_runtime_readback_active_blockers_missing_from_metric(
payload,
active_blockers,
)
can_claim_slo = metric_readback.get("ready") is True and not active_blockers
primary_blocker = str(

View File

@@ -331,6 +331,16 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
payload = load_latest_reboot_auto_recovery_slo_scorecard(
prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK
)
expected_runtime_readback_added_blockers = [
"service_green_not_1",
"post_start_blocked_not_zero",
"backup_core_green_not_1",
"wazuh_dashboard_degraded",
]
expected_active_blockers = [
*PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS,
*expected_runtime_readback_added_blockers,
]
assert payload["runtime_scorecard_readback_present"] is False
assert payload["runtime_metric_readback_present"] is True
@@ -341,17 +351,21 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
== [PROMETHEUS_SOURCE_RECONCILED_BLOCKER]
)
assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 1
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS
assert payload["active_blocker_count"] == 7
assert payload["active_blockers"] == expected_active_blockers
assert payload["active_blocker_count"] == 11
assert payload["readiness_percent"] == 47
assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
assert payload["next_safe_action"] == (
"rerun_reboot_event_detector_and_host_probe_verify_only_no_reboot"
)
assert "backup_core_green_not_1" not in payload["active_blockers"]
assert "service_green_not_1" not in payload["active_blockers"]
assert "backup_core_green_not_1" in payload["active_blockers"]
assert "service_green_not_1" in payload["active_blockers"]
assert PROMETHEUS_SOURCE_RECONCILED_BLOCKER not in payload["active_blockers"]
assert payload["active_blocker_action_matrix"]["item_count"] == 7
assert payload["runtime_metric_runtime_readback_added_blockers"] == (
expected_runtime_readback_added_blockers
)
assert payload["runtime_metric_runtime_readback_added_blocker_count"] == 4
assert payload["active_blocker_action_matrix"]["item_count"] == 11
assert payload["windows99_vmware_autostart"]["readback_present"] is True
assert payload["windows99_vmware_autostart"]["missing_vmx_aliases"] == ["111"]
assert payload["windows99_vmware_autostart"]["powered_off_aliases"] == [
@@ -401,16 +415,34 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
"host_cpu_pressure",
0,
) == 0
assert payload["readback"]["active_blocker_count"] == 7
assert payload["active_blocker_action_matrix"]["category_counts"][
"post_reboot_service_readiness"
] == 2
assert payload["active_blocker_action_matrix"]["category_counts"][
"backup_observability"
] == 1
assert payload["active_blocker_action_matrix"]["category_counts"][
"security_observability"
] == 1
assert payload["readback"]["active_blocker_count"] == 11
assert payload["readback"]["runtime_metric_readback_present"] is True
assert payload["readback"][
"runtime_metric_source_control_reconciled_blocker_count"
] == 1
assert payload["rollups"]["active_blocker_count"] == 7
assert payload["readback"][
"runtime_metric_runtime_readback_added_blockers"
] == expected_runtime_readback_added_blockers
assert payload["readback"][
"runtime_metric_runtime_readback_added_blocker_count"
] == 4
assert payload["rollups"]["active_blocker_count"] == 11
assert payload["rollups"]["runtime_metric_readback_present"] is True
assert payload["rollups"][
"runtime_metric_source_control_reconciled_blocker_count"
] == 1
assert payload["rollups"][
"runtime_metric_runtime_readback_added_blocker_count"
] == 4
assert payload["rollups"]["primary_blocker_owner_lane"] == (
"reboot_event_detector_and_host_probe"
)
@@ -433,8 +465,14 @@ def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when
prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK,
)
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
assert payload["active_blocker_count"] == 8
assert payload["active_blockers"] == [
*PROMETHEUS_RUNTIME_BLOCKERS,
"service_green_not_1",
"post_start_blocked_not_zero",
"backup_core_green_not_1",
"wazuh_dashboard_degraded",
]
assert payload["active_blocker_count"] == 12
assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 0
action_by_blocker = {
item["blocker"]: item