diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 18116af0d..91a82ca0a 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -45,6 +45,11 @@ _PROMETHEUS_RUNTIME_QUERIES = { "ready": "awoooi_reboot_auto_recovery_slo_ready", "last_run_timestamp": "awoooi_reboot_auto_recovery_slo_last_run_timestamp", } +_PROMETHEUS_RUNTIME_COMBINED_QUERY = ( + '{__name__=~"' + + "|".join(_PROMETHEUS_RUNTIME_QUERIES.values()) + + '"}' +) _PUBLIC_MAINTENANCE_BLOCKERS = { "public_maintenance_fallback_runtime_readback_missing", "public_route_raw_5xx_without_maintenance_fallback", @@ -213,27 +218,13 @@ def _load_reboot_slo_prometheus_metric_readback( or os.environ.get(_PROMETHEUS_QUERY_URL_ENV, "").strip() or _DEFAULT_PROMETHEUS_QUERY_URL ) - timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=3.0) + timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=1.0) try: - active_results = _query_prometheus_vector( + combined_results = _query_prometheus_vector( query_url, - _PROMETHEUS_RUNTIME_QUERIES["active_blocker"], + _PROMETHEUS_RUNTIME_COMBINED_QUERY, timeout_seconds=timeout_seconds, ) - primary_results = _query_prometheus_vector( - query_url, - _PROMETHEUS_RUNTIME_QUERIES["primary_blocker"], - timeout_seconds=timeout_seconds, - ) - scalar_results = { - key: _query_prometheus_vector( - query_url, - query, - timeout_seconds=timeout_seconds, - ) - for key, query in _PROMETHEUS_RUNTIME_QUERIES.items() - if key not in {"active_blocker", "primary_blocker"} - } except ( OSError, TimeoutError, @@ -249,6 +240,24 @@ def _load_reboot_slo_prometheus_metric_readback( "error_class": exc.__class__.__name__, } + results_by_name: dict[str, list[dict[str, Any]]] = {} + for result in combined_results: + metric_name = str(_dict(result.get("metric")).get("__name__") or "") + if metric_name: + results_by_name.setdefault(metric_name, []).append(result) + active_results = results_by_name.get( + _PROMETHEUS_RUNTIME_QUERIES["active_blocker"], + [], + ) + primary_results = results_by_name.get( + _PROMETHEUS_RUNTIME_QUERIES["primary_blocker"], + [], + ) + scalar_results = { + key: results_by_name.get(query, []) + for key, query in _PROMETHEUS_RUNTIME_QUERIES.items() + if key not in {"active_blocker", "primary_blocker"} + } active_blockers = _unique_strings( [ str(_dict(result.get("metric")).get("blocker") or "") @@ -277,12 +286,7 @@ def _load_reboot_slo_prometheus_metric_readback( ) return { "schema_version": "reboot_slo_prometheus_metric_readback_v1", - "readback_present": bool( - active_results - or primary_results - or scalar_results.get("blocker_count") - or scalar_results.get("ready") - ), + "readback_present": bool(combined_results), "source": "prometheus_query_api", "query_url": query_url, "active_blockers": active_blockers, diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 9692f1d34..c903487e2 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -8,6 +8,7 @@ from fastapi.testclient import TestClient from src.api.v1 import agents from src.api.v1.agents import router +from src.services import reboot_auto_recovery_slo_scorecard as reboot_slo_scorecard from src.services.reboot_auto_recovery_drill_preflight import ( load_latest_reboot_auto_recovery_drill_preflight, ) @@ -296,6 +297,69 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics( ) +def test_reboot_slo_prometheus_readback_uses_single_combined_query(monkeypatch): + monkeypatch.delenv("AWOOOI_REBOOT_SLO_PROMETHEUS_TIMEOUT_SECONDS", raising=False) + calls = [] + + def fake_query_prometheus_vector(query_url, query, *, timeout_seconds): + calls.append((query_url, query, timeout_seconds)) + active_metric = "awoooi_reboot_auto_recovery_slo_active_blocker" + return [ + *[ + { + "metric": {"__name__": active_metric, "blocker": blocker}, + "value": [1783010479, "1"], + } + for blocker in PROMETHEUS_RUNTIME_BLOCKERS + ], + { + "metric": { + "__name__": "awoooi_reboot_auto_recovery_slo_primary_blocker", + "blocker": "reboot_event_required_host_unreachable", + }, + "value": [1783010479, "1"], + }, + { + "metric": { + "__name__": "awoooi_reboot_auto_recovery_slo_blocker_count", + }, + "value": [1783010479, "7"], + }, + { + "metric": {"__name__": "awoooi_reboot_auto_recovery_slo_ready"}, + "value": [1783010479, "0"], + }, + { + "metric": { + "__name__": "awoooi_reboot_auto_recovery_slo_last_run_timestamp", + }, + "value": [1783010479, "1783010479"], + }, + ] + + monkeypatch.setattr( + reboot_slo_scorecard, + "_query_prometheus_vector", + fake_query_prometheus_vector, + ) + + payload = reboot_slo_scorecard._load_reboot_slo_prometheus_metric_readback( + prometheus_query_url="http://prometheus.example/api/v1/query" + ) + + assert len(calls) == 1 + assert calls[0][0] == "http://prometheus.example/api/v1/query" + assert calls[0][2] == 1.0 + assert "awoooi_reboot_auto_recovery_slo_active_blocker" in calls[0][1] + assert "awoooi_reboot_auto_recovery_slo_last_run_timestamp" in calls[0][1] + assert payload["readback_present"] is True + assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS + assert payload["active_blocker_count"] == 7 + assert payload["primary_blocker"] == "reboot_event_required_host_unreachable" + assert payload["ready"] is False + assert payload["last_run_timestamp"] == 1783010479 + + def test_reboot_auto_recovery_slo_scorecard_endpoint_returns_readback(monkeypatch): monkeypatch.setattr( agents,