fix(agents): reconcile reboot metric source blockers
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 55s
CD Pipeline / build-and-deploy (push) Successful in 4m56s
AI 技術雷達監控 / ai-technology-watch (push) Successful in 42s
CD Pipeline / post-deploy-checks (push) Successful in 1m52s

This commit is contained in:
Your Name
2026-07-03 01:57:11 +08:00
parent b994a87558
commit 1392369e56
2 changed files with 108 additions and 13 deletions

View File

@@ -55,6 +55,11 @@ _PUBLIC_MAINTENANCE_BLOCKERS = {
"public_route_raw_5xx_without_maintenance_fallback",
"public_route_unreachable_without_external_l1_fallback",
}
# Maps a blocker name reported by the Prometheus metric readback to the
# ``source_controls`` key that, when set to ``True`` in the scorecard payload,
# causes that blocker to be removed from the active list.
_PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS = {
    "conversation_event_hot_path_index_migration_source_missing": (
        "conversation_event_hot_path_index_migration_source_present"
    ),
}
def load_latest_reboot_auto_recovery_slo_scorecard(
@@ -338,6 +343,8 @@ def _annotate_prometheus_metric_readback(
payload["runtime_metric_last_run_timestamp"] = _int(
metric_readback.get("last_run_timestamp")
)
payload["runtime_metric_source_control_reconciled_blockers"] = []
payload["runtime_metric_source_control_reconciled_blocker_count"] = 0
readback = _dict(payload.setdefault("readback", {}))
readback["runtime_metric_readback_present"] = present
@@ -348,6 +355,8 @@ def _annotate_prometheus_metric_readback(
readback["runtime_metric_last_run_timestamp"] = payload[
"runtime_metric_last_run_timestamp"
]
readback["runtime_metric_source_control_reconciled_blockers"] = []
readback["runtime_metric_source_control_reconciled_blocker_count"] = 0
rollups = _dict(payload.setdefault("rollups", {}))
rollups["runtime_metric_readback_present"] = present
@@ -357,6 +366,39 @@ def _annotate_prometheus_metric_readback(
rollups["runtime_metric_last_run_timestamp"] = payload[
"runtime_metric_last_run_timestamp"
]
rollups["runtime_metric_source_control_reconciled_blocker_count"] = 0
def _reconcile_prometheus_metric_active_blockers_with_source_controls(
    payload: dict[str, Any],
    active_blockers: list[str],
) -> list[str]:
    """Filter out metric blockers that the payload's source controls refute.

    A blocker is dropped when ``_PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS`` maps
    it to a key whose value in ``payload["source_controls"]`` is exactly
    ``True``. When at least one blocker is dropped, the dropped names and
    their count are recorded on ``payload`` and its ``readback`` section, and
    the count alone on its ``rollups`` section.

    Args:
        payload: Scorecard payload; read for ``source_controls`` and mutated
            in place with the reconciliation bookkeeping.
        active_blockers: Blocker names taken from the metric readback.

    Returns:
        The blockers that were not dropped, passed through
        ``_unique_strings``.
    """
    # NOTE(review): the reviewed source had lost its indentation; this layout
    # assumes every bookkeeping write sits under the "anything dropped" guard.
    controls = _dict(payload.get("source_controls"))
    kept: list[str] = []
    dropped: list[str] = []
    for name in active_blockers:
        control_key = _PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS.get(name)
        # Only an explicit ``True`` counts; truthy non-bool values do not.
        is_refuted = bool(control_key) and controls.get(control_key) is True
        (dropped if is_refuted else kept).append(name)

    dropped = _unique_strings(dropped)
    if not dropped:
        return _unique_strings(kept)

    names_key = "runtime_metric_source_control_reconciled_blockers"
    count_key = "runtime_metric_source_control_reconciled_blocker_count"
    dropped_count = len(dropped)

    payload[names_key] = dropped
    payload[count_key] = dropped_count

    readback_section = _dict(payload.setdefault("readback", {}))
    readback_section[names_key] = dropped
    readback_section[count_key] = dropped_count

    # Rollups carry only the count, not the list of names.
    rollup_section = _dict(payload.setdefault("rollups", {}))
    rollup_section[count_key] = dropped_count

    return _unique_strings(kept)
def _apply_prometheus_metric_active_blockers(
@@ -368,6 +410,10 @@ def _apply_prometheus_metric_active_blockers(
active_blockers = _strings(metric_readback.get("active_blockers"))
if not active_blockers and _int(metric_readback.get("active_blocker_count")):
return
active_blockers = _reconcile_prometheus_metric_active_blockers_with_source_controls(
payload,
active_blockers,
)
can_claim_slo = metric_readback.get("ready") is True and not active_blockers
primary_blocker = str(

View File

@@ -89,6 +89,14 @@ PROMETHEUS_RUNTIME_BLOCKERS = [
"reboot_event_required_host_unreachable",
"windows99_vmware_autostart_readback_missing",
]
# The runtime blocker that the tests expect source-control reconciliation to
# remove from the active list.
PROMETHEUS_SOURCE_RECONCILED_BLOCKER = (
    "conversation_event_hot_path_index_migration_source_missing"
)
# PROMETHEUS_RUNTIME_BLOCKERS minus the reconciled blocker, order preserved.
PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS = [
    name
    for name in PROMETHEUS_RUNTIME_BLOCKERS
    if name != PROMETHEUS_SOURCE_RECONCILED_BLOCKER
]
PROMETHEUS_RUNTIME_READBACK = {
"schema_version": "reboot_slo_prometheus_metric_readback_v1",
@@ -258,8 +266,13 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
assert payload["runtime_metric_readback_present"] is True
assert payload["runtime_metric_active_blocker_count"] == 7
assert payload["runtime_metric_last_run_timestamp"] == 1783010479
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
assert payload["active_blocker_count"] == 7
assert (
payload["runtime_metric_source_control_reconciled_blockers"]
== [PROMETHEUS_SOURCE_RECONCILED_BLOCKER]
)
assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 1
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS
assert payload["active_blocker_count"] == 6
assert payload["readiness_percent"] == 47
assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
assert payload["next_safe_action"] == (
@@ -267,14 +280,57 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
)
assert "backup_core_green_not_1" not in payload["active_blockers"]
assert "service_green_not_1" not in payload["active_blockers"]
assert payload["active_blocker_action_matrix"]["item_count"] == 7
assert PROMETHEUS_SOURCE_RECONCILED_BLOCKER not in payload["active_blockers"]
assert payload["active_blocker_action_matrix"]["item_count"] == 6
action_by_blocker = {
item["blocker"]: item
for item in payload["active_blocker_action_matrix"]["items"]
}
hot_path_source_action = action_by_blocker[
"conversation_event_hot_path_index_migration_source_missing"
]
assert PROMETHEUS_SOURCE_RECONCILED_BLOCKER not in action_by_blocker
assert payload["active_blocker_action_matrix"]["category_counts"].get(
"host_cpu_pressure",
0,
) == 0
assert payload["readback"]["active_blocker_count"] == 6
assert payload["readback"]["runtime_metric_readback_present"] is True
assert payload["readback"][
"runtime_metric_source_control_reconciled_blocker_count"
] == 1
assert payload["rollups"]["active_blocker_count"] == 6
assert payload["rollups"]["runtime_metric_readback_present"] is True
assert payload["rollups"][
"runtime_metric_source_control_reconciled_blocker_count"
] == 1
assert payload["rollups"]["primary_blocker_owner_lane"] == (
"reboot_event_detector_and_host_probe"
)
def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when_source_control_missing(
tmp_path,
):
scorecard = json.loads(_SOURCE_SCORECARD.read_text(encoding="utf-8"))
scorecard["source_controls"][
"conversation_event_hot_path_index_migration_source_present"
] = False
(tmp_path / _SOURCE_SCORECARD.name).write_text(
json.dumps(scorecard),
encoding="utf-8",
)
payload = load_latest_reboot_auto_recovery_slo_scorecard(
operations_dir=tmp_path,
prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK,
)
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
assert payload["active_blocker_count"] == 7
assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 0
action_by_blocker = {
item["blocker"]: item
for item in payload["active_blocker_action_matrix"]["items"]
}
hot_path_source_action = action_by_blocker[PROMETHEUS_SOURCE_RECONCILED_BLOCKER]
assert hot_path_source_action["category"] == "host_cpu_pressure"
assert hot_path_source_action["owner_lane"] == "host_pressure_controller"
assert hot_path_source_action["evidence_inputs"] == [
@@ -288,13 +344,6 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
assert payload["active_blocker_action_matrix"]["category_counts"][
"host_cpu_pressure"
] == 1
assert payload["readback"]["active_blocker_count"] == 7
assert payload["readback"]["runtime_metric_readback_present"] is True
assert payload["rollups"]["active_blocker_count"] == 7
assert payload["rollups"]["runtime_metric_readback_present"] is True
assert payload["rollups"]["primary_blocker_owner_lane"] == (
"reboot_event_detector_and_host_probe"
)
def test_reboot_slo_prometheus_readback_uses_single_combined_query(monkeypatch):