fix(reboot): bound slo prometheus readback latency
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 58s
CD Pipeline / build-and-deploy (push) Successful in 5m57s
CD Pipeline / post-deploy-checks (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 58s
CD Pipeline / build-and-deploy (push) Successful in 5m57s
CD Pipeline / post-deploy-checks (push) Has been cancelled
This commit is contained in:
@@ -45,6 +45,11 @@ _PROMETHEUS_RUNTIME_QUERIES = {
|
||||
"ready": "awoooi_reboot_auto_recovery_slo_ready",
|
||||
"last_run_timestamp": "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
|
||||
}
|
||||
# One selector that matches every series named in _PROMETHEUS_RUNTIME_QUERIES:
# the metric names are OR-ed together inside a `__name__=~"..."` regex matcher,
# so a single query can replace one request per metric.
_PROMETHEUS_RUNTIME_COMBINED_QUERY = "".join(
    (
        '{__name__=~"',
        "|".join(_PROMETHEUS_RUNTIME_QUERIES.values()),
        '"}',
    )
)
|
||||
_PUBLIC_MAINTENANCE_BLOCKERS = {
|
||||
"public_maintenance_fallback_runtime_readback_missing",
|
||||
"public_route_raw_5xx_without_maintenance_fallback",
|
||||
@@ -213,27 +218,13 @@ def _load_reboot_slo_prometheus_metric_readback(
|
||||
or os.environ.get(_PROMETHEUS_QUERY_URL_ENV, "").strip()
|
||||
or _DEFAULT_PROMETHEUS_QUERY_URL
|
||||
)
|
||||
timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=3.0)
|
||||
timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=1.0)
|
||||
try:
|
||||
active_results = _query_prometheus_vector(
|
||||
combined_results = _query_prometheus_vector(
|
||||
query_url,
|
||||
_PROMETHEUS_RUNTIME_QUERIES["active_blocker"],
|
||||
_PROMETHEUS_RUNTIME_COMBINED_QUERY,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
primary_results = _query_prometheus_vector(
|
||||
query_url,
|
||||
_PROMETHEUS_RUNTIME_QUERIES["primary_blocker"],
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
scalar_results = {
|
||||
key: _query_prometheus_vector(
|
||||
query_url,
|
||||
query,
|
||||
timeout_seconds=timeout_seconds,
|
||||
)
|
||||
for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
|
||||
if key not in {"active_blocker", "primary_blocker"}
|
||||
}
|
||||
except (
|
||||
OSError,
|
||||
TimeoutError,
|
||||
@@ -249,6 +240,24 @@ def _load_reboot_slo_prometheus_metric_readback(
|
||||
"error_class": exc.__class__.__name__,
|
||||
}
|
||||
|
||||
results_by_name: dict[str, list[dict[str, Any]]] = {}
|
||||
for result in combined_results:
|
||||
metric_name = str(_dict(result.get("metric")).get("__name__") or "")
|
||||
if metric_name:
|
||||
results_by_name.setdefault(metric_name, []).append(result)
|
||||
active_results = results_by_name.get(
|
||||
_PROMETHEUS_RUNTIME_QUERIES["active_blocker"],
|
||||
[],
|
||||
)
|
||||
primary_results = results_by_name.get(
|
||||
_PROMETHEUS_RUNTIME_QUERIES["primary_blocker"],
|
||||
[],
|
||||
)
|
||||
scalar_results = {
|
||||
key: results_by_name.get(query, [])
|
||||
for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
|
||||
if key not in {"active_blocker", "primary_blocker"}
|
||||
}
|
||||
active_blockers = _unique_strings(
|
||||
[
|
||||
str(_dict(result.get("metric")).get("blocker") or "")
|
||||
@@ -277,12 +286,7 @@ def _load_reboot_slo_prometheus_metric_readback(
|
||||
)
|
||||
return {
|
||||
"schema_version": "reboot_slo_prometheus_metric_readback_v1",
|
||||
"readback_present": bool(
|
||||
active_results
|
||||
or primary_results
|
||||
or scalar_results.get("blocker_count")
|
||||
or scalar_results.get("ready")
|
||||
),
|
||||
"readback_present": bool(combined_results),
|
||||
"source": "prometheus_query_api",
|
||||
"query_url": query_url,
|
||||
"active_blockers": active_blockers,
|
||||
|
||||
@@ -8,6 +8,7 @@ from fastapi.testclient import TestClient
|
||||
|
||||
from src.api.v1 import agents
|
||||
from src.api.v1.agents import router
|
||||
from src.services import reboot_auto_recovery_slo_scorecard as reboot_slo_scorecard
|
||||
from src.services.reboot_auto_recovery_drill_preflight import (
|
||||
load_latest_reboot_auto_recovery_drill_preflight,
|
||||
)
|
||||
@@ -296,6 +297,69 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
|
||||
)
|
||||
|
||||
|
||||
def test_reboot_slo_prometheus_readback_uses_single_combined_query(monkeypatch):
    """Check that the readback loader issues exactly one Prometheus query.

    The Prometheus query helper is replaced by a fake that records each call
    and returns one mixed vector holding every runtime SLO series. The test
    then asserts that a single call was made with the given URL, a 1.0 second
    timeout (the timeout environment variable is removed first), and a query
    that mentions both the first and last metric names, and that the payload
    fields are derived from that one combined result.
    """
    # Remove any override so the loader's built-in timeout is what gets used.
    monkeypatch.delenv("AWOOOI_REBOOT_SLO_PROMETHEUS_TIMEOUT_SECONDS", raising=False)
    recorded_calls = []

    def fake_query_prometheus_vector(query_url, query, *, timeout_seconds):
        recorded_calls.append((query_url, query, timeout_seconds))
        active_metric = "awoooi_reboot_auto_recovery_slo_active_blocker"

        # One active-blocker series per expected runtime blocker.
        samples = [
            {
                "metric": {"__name__": active_metric, "blocker": blocker},
                "value": [1783010479, "1"],
            }
            for blocker in PROMETHEUS_RUNTIME_BLOCKERS
        ]
        samples.append(
            {
                "metric": {
                    "__name__": "awoooi_reboot_auto_recovery_slo_primary_blocker",
                    "blocker": "reboot_event_required_host_unreachable",
                },
                "value": [1783010479, "1"],
            }
        )
        samples.append(
            {
                "metric": {
                    "__name__": "awoooi_reboot_auto_recovery_slo_blocker_count",
                },
                "value": [1783010479, "7"],
            }
        )
        samples.append(
            {
                "metric": {"__name__": "awoooi_reboot_auto_recovery_slo_ready"},
                "value": [1783010479, "0"],
            }
        )
        samples.append(
            {
                "metric": {
                    "__name__": "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
                },
                "value": [1783010479, "1783010479"],
            }
        )
        return samples

    monkeypatch.setattr(
        reboot_slo_scorecard,
        "_query_prometheus_vector",
        fake_query_prometheus_vector,
    )

    payload = reboot_slo_scorecard._load_reboot_slo_prometheus_metric_readback(
        prometheus_query_url="http://prometheus.example/api/v1/query"
    )

    # Exactly one round trip, with the expected URL, query contents and timeout.
    assert len(recorded_calls) == 1
    called_url, called_query, called_timeout = recorded_calls[0]
    assert called_url == "http://prometheus.example/api/v1/query"
    assert called_timeout == 1.0
    assert "awoooi_reboot_auto_recovery_slo_active_blocker" in called_query
    assert "awoooi_reboot_auto_recovery_slo_last_run_timestamp" in called_query

    # Every payload field comes from the single combined vector.
    assert payload["readback_present"] is True
    assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
    assert payload["active_blocker_count"] == 7
    assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
    assert payload["ready"] is False
    assert payload["last_run_timestamp"] == 1783010479
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_endpoint_returns_readback(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
agents,
|
||||
|
||||
Reference in New Issue
Block a user