fix(agents): reconcile reboot metric source blockers
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 55s
CD Pipeline / build-and-deploy (push) Successful in 4m56s
AI 技術雷達監控 / ai-technology-watch (push) Successful in 42s
CD Pipeline / post-deploy-checks (push) Successful in 1m52s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 55s
CD Pipeline / build-and-deploy (push) Successful in 4m56s
AI 技術雷達監控 / ai-technology-watch (push) Successful in 42s
CD Pipeline / post-deploy-checks (push) Successful in 1m52s
This commit is contained in:
@@ -55,6 +55,11 @@ _PUBLIC_MAINTENANCE_BLOCKERS = {
|
||||
"public_route_raw_5xx_without_maintenance_fallback",
|
||||
"public_route_unreachable_without_external_l1_fallback",
|
||||
}
|
||||
_PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS = {
|
||||
"conversation_event_hot_path_index_migration_source_missing": (
|
||||
"conversation_event_hot_path_index_migration_source_present"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def load_latest_reboot_auto_recovery_slo_scorecard(
|
||||
@@ -338,6 +343,8 @@ def _annotate_prometheus_metric_readback(
|
||||
payload["runtime_metric_last_run_timestamp"] = _int(
|
||||
metric_readback.get("last_run_timestamp")
|
||||
)
|
||||
payload["runtime_metric_source_control_reconciled_blockers"] = []
|
||||
payload["runtime_metric_source_control_reconciled_blocker_count"] = 0
|
||||
|
||||
readback = _dict(payload.setdefault("readback", {}))
|
||||
readback["runtime_metric_readback_present"] = present
|
||||
@@ -348,6 +355,8 @@ def _annotate_prometheus_metric_readback(
|
||||
readback["runtime_metric_last_run_timestamp"] = payload[
|
||||
"runtime_metric_last_run_timestamp"
|
||||
]
|
||||
readback["runtime_metric_source_control_reconciled_blockers"] = []
|
||||
readback["runtime_metric_source_control_reconciled_blocker_count"] = 0
|
||||
|
||||
rollups = _dict(payload.setdefault("rollups", {}))
|
||||
rollups["runtime_metric_readback_present"] = present
|
||||
@@ -357,6 +366,39 @@ def _annotate_prometheus_metric_readback(
|
||||
rollups["runtime_metric_last_run_timestamp"] = payload[
|
||||
"runtime_metric_last_run_timestamp"
|
||||
]
|
||||
rollups["runtime_metric_source_control_reconciled_blocker_count"] = 0
|
||||
|
||||
|
||||
def _reconcile_prometheus_metric_active_blockers_with_source_controls(
    payload: dict[str, Any],
    active_blockers: list[str],
) -> list[str]:
    """Filter out runtime blockers that the payload's source controls satisfy.

    A blocker is dropped when ``_PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS`` maps
    it to a key whose value in ``payload["source_controls"]`` is exactly
    ``True``. Every other blocker is kept in its original order.

    When at least one blocker is dropped, the dropped names and their count
    are written to ``payload`` and to its ``readback`` section, and the count
    alone is written to its ``rollups`` section. Nothing is written when no
    blocker is dropped.

    Args:
        payload: Scorecard payload; read for ``source_controls`` and updated
            in place with the reconciliation bookkeeping.
        active_blockers: Blocker names reported by the metric readback.

    Returns:
        The blockers that remain active, passed through ``_unique_strings``
        (presumably de-duplicating -- confirm against that helper).
    """
    controls = _dict(payload.get("source_controls"))

    def _source_control_satisfied(name: str) -> bool:
        # Only blockers listed in the mapping can be reconciled, and only by
        # an explicit ``True`` (truthy non-bool values do not count).
        control_key = _PROMETHEUS_SOURCE_CONTROLLED_BLOCKERS.get(name)
        return bool(control_key) and controls.get(control_key) is True

    still_active: list[str] = []
    dropped: list[str] = []
    for name in active_blockers:
        bucket = dropped if _source_control_satisfied(name) else still_active
        bucket.append(name)

    dropped = _unique_strings(dropped)
    if dropped:
        blockers_key = "runtime_metric_source_control_reconciled_blockers"
        count_key = "runtime_metric_source_control_reconciled_blocker_count"
        dropped_count = len(dropped)

        payload[blockers_key] = dropped
        payload[count_key] = dropped_count

        readback_section = _dict(payload.setdefault("readback", {}))
        readback_section[blockers_key] = dropped
        readback_section[count_key] = dropped_count

        # Rollups carry only the count, not the blocker names.
        rollups_section = _dict(payload.setdefault("rollups", {}))
        rollups_section[count_key] = dropped_count

    return _unique_strings(still_active)
|
||||
|
||||
|
||||
def _apply_prometheus_metric_active_blockers(
|
||||
@@ -368,6 +410,10 @@ def _apply_prometheus_metric_active_blockers(
|
||||
active_blockers = _strings(metric_readback.get("active_blockers"))
|
||||
if not active_blockers and _int(metric_readback.get("active_blocker_count")):
|
||||
return
|
||||
active_blockers = _reconcile_prometheus_metric_active_blockers_with_source_controls(
|
||||
payload,
|
||||
active_blockers,
|
||||
)
|
||||
|
||||
can_claim_slo = metric_readback.get("ready") is True and not active_blockers
|
||||
primary_blocker = str(
|
||||
|
||||
@@ -89,6 +89,14 @@ PROMETHEUS_RUNTIME_BLOCKERS = [
|
||||
"reboot_event_required_host_unreachable",
|
||||
"windows99_vmware_autostart_readback_missing",
|
||||
]
|
||||
# Blocker name the tests expect to be reported as source-control reconciled.
PROMETHEUS_SOURCE_RECONCILED_BLOCKER = "conversation_event_hot_path_index_migration_source_missing"
# PROMETHEUS_RUNTIME_BLOCKERS with that one blocker removed, order preserved.
PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS = [
    name
    for name in PROMETHEUS_RUNTIME_BLOCKERS
    if name != PROMETHEUS_SOURCE_RECONCILED_BLOCKER
]
|
||||
|
||||
PROMETHEUS_RUNTIME_READBACK = {
|
||||
"schema_version": "reboot_slo_prometheus_metric_readback_v1",
|
||||
@@ -258,8 +266,13 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
|
||||
assert payload["runtime_metric_readback_present"] is True
|
||||
assert payload["runtime_metric_active_blocker_count"] == 7
|
||||
assert payload["runtime_metric_last_run_timestamp"] == 1783010479
|
||||
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
|
||||
assert payload["active_blocker_count"] == 7
|
||||
assert (
|
||||
payload["runtime_metric_source_control_reconciled_blockers"]
|
||||
== [PROMETHEUS_SOURCE_RECONCILED_BLOCKER]
|
||||
)
|
||||
assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 1
|
||||
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_SOURCE_RECONCILED_BLOCKERS
|
||||
assert payload["active_blocker_count"] == 6
|
||||
assert payload["readiness_percent"] == 47
|
||||
assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
|
||||
assert payload["next_safe_action"] == (
|
||||
@@ -267,14 +280,57 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
|
||||
)
|
||||
assert "backup_core_green_not_1" not in payload["active_blockers"]
|
||||
assert "service_green_not_1" not in payload["active_blockers"]
|
||||
assert payload["active_blocker_action_matrix"]["item_count"] == 7
|
||||
assert PROMETHEUS_SOURCE_RECONCILED_BLOCKER not in payload["active_blockers"]
|
||||
assert payload["active_blocker_action_matrix"]["item_count"] == 6
|
||||
action_by_blocker = {
|
||||
item["blocker"]: item
|
||||
for item in payload["active_blocker_action_matrix"]["items"]
|
||||
}
|
||||
hot_path_source_action = action_by_blocker[
|
||||
"conversation_event_hot_path_index_migration_source_missing"
|
||||
]
|
||||
assert PROMETHEUS_SOURCE_RECONCILED_BLOCKER not in action_by_blocker
|
||||
assert payload["active_blocker_action_matrix"]["category_counts"].get(
|
||||
"host_cpu_pressure",
|
||||
0,
|
||||
) == 0
|
||||
assert payload["readback"]["active_blocker_count"] == 6
|
||||
assert payload["readback"]["runtime_metric_readback_present"] is True
|
||||
assert payload["readback"][
|
||||
"runtime_metric_source_control_reconciled_blocker_count"
|
||||
] == 1
|
||||
assert payload["rollups"]["active_blocker_count"] == 6
|
||||
assert payload["rollups"]["runtime_metric_readback_present"] is True
|
||||
assert payload["rollups"][
|
||||
"runtime_metric_source_control_reconciled_blocker_count"
|
||||
] == 1
|
||||
assert payload["rollups"]["primary_blocker_owner_lane"] == (
|
||||
"reboot_event_detector_and_host_probe"
|
||||
)
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when_source_control_missing(
|
||||
tmp_path,
|
||||
):
|
||||
scorecard = json.loads(_SOURCE_SCORECARD.read_text(encoding="utf-8"))
|
||||
scorecard["source_controls"][
|
||||
"conversation_event_hot_path_index_migration_source_present"
|
||||
] = False
|
||||
(tmp_path / _SOURCE_SCORECARD.name).write_text(
|
||||
json.dumps(scorecard),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
payload = load_latest_reboot_auto_recovery_slo_scorecard(
|
||||
operations_dir=tmp_path,
|
||||
prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK,
|
||||
)
|
||||
|
||||
assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
|
||||
assert payload["active_blocker_count"] == 7
|
||||
assert payload["runtime_metric_source_control_reconciled_blocker_count"] == 0
|
||||
action_by_blocker = {
|
||||
item["blocker"]: item
|
||||
for item in payload["active_blocker_action_matrix"]["items"]
|
||||
}
|
||||
hot_path_source_action = action_by_blocker[PROMETHEUS_SOURCE_RECONCILED_BLOCKER]
|
||||
assert hot_path_source_action["category"] == "host_cpu_pressure"
|
||||
assert hot_path_source_action["owner_lane"] == "host_pressure_controller"
|
||||
assert hot_path_source_action["evidence_inputs"] == [
|
||||
@@ -288,13 +344,6 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
|
||||
assert payload["active_blocker_action_matrix"]["category_counts"][
|
||||
"host_cpu_pressure"
|
||||
] == 1
|
||||
assert payload["readback"]["active_blocker_count"] == 7
|
||||
assert payload["readback"]["runtime_metric_readback_present"] is True
|
||||
assert payload["rollups"]["active_blocker_count"] == 7
|
||||
assert payload["rollups"]["runtime_metric_readback_present"] is True
|
||||
assert payload["rollups"]["primary_blocker_owner_lane"] == (
|
||||
"reboot_event_detector_and_host_probe"
|
||||
)
|
||||
|
||||
|
||||
def test_reboot_slo_prometheus_readback_uses_single_combined_query(monkeypatch):
|
||||
|
||||
Reference in New Issue
Block a user