fix(reboot): bound slo prometheus readback latency
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 58s
CD Pipeline / build-and-deploy (push) Successful in 5m57s
CD Pipeline / post-deploy-checks (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-03 01:08:28 +08:00
parent 4c897d23c9
commit e1f232b153
2 changed files with 91 additions and 23 deletions

View File

@@ -45,6 +45,11 @@ _PROMETHEUS_RUNTIME_QUERIES = {
"ready": "awoooi_reboot_auto_recovery_slo_ready",
"last_run_timestamp": "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
}
# One selector that matches every runtime metric by its ``__name__`` label,
# built by joining the metric names from _PROMETHEUS_RUNTIME_QUERIES with "|".
_PROMETHEUS_RUNTIME_COMBINED_QUERY = '{{__name__=~"{}"}}'.format(
    "|".join(_PROMETHEUS_RUNTIME_QUERIES.values())
)
_PUBLIC_MAINTENANCE_BLOCKERS = {
"public_maintenance_fallback_runtime_readback_missing",
"public_route_raw_5xx_without_maintenance_fallback",
@@ -213,27 +218,13 @@ def _load_reboot_slo_prometheus_metric_readback(
or os.environ.get(_PROMETHEUS_QUERY_URL_ENV, "").strip()
or _DEFAULT_PROMETHEUS_QUERY_URL
)
timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=3.0)
timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=1.0)
try:
active_results = _query_prometheus_vector(
combined_results = _query_prometheus_vector(
query_url,
_PROMETHEUS_RUNTIME_QUERIES["active_blocker"],
_PROMETHEUS_RUNTIME_COMBINED_QUERY,
timeout_seconds=timeout_seconds,
)
primary_results = _query_prometheus_vector(
query_url,
_PROMETHEUS_RUNTIME_QUERIES["primary_blocker"],
timeout_seconds=timeout_seconds,
)
scalar_results = {
key: _query_prometheus_vector(
query_url,
query,
timeout_seconds=timeout_seconds,
)
for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
if key not in {"active_blocker", "primary_blocker"}
}
except (
OSError,
TimeoutError,
@@ -249,6 +240,24 @@ def _load_reboot_slo_prometheus_metric_readback(
"error_class": exc.__class__.__name__,
}
results_by_name: dict[str, list[dict[str, Any]]] = {}
for result in combined_results:
metric_name = str(_dict(result.get("metric")).get("__name__") or "")
if metric_name:
results_by_name.setdefault(metric_name, []).append(result)
active_results = results_by_name.get(
_PROMETHEUS_RUNTIME_QUERIES["active_blocker"],
[],
)
primary_results = results_by_name.get(
_PROMETHEUS_RUNTIME_QUERIES["primary_blocker"],
[],
)
scalar_results = {
key: results_by_name.get(query, [])
for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
if key not in {"active_blocker", "primary_blocker"}
}
active_blockers = _unique_strings(
[
str(_dict(result.get("metric")).get("blocker") or "")
@@ -277,12 +286,7 @@ def _load_reboot_slo_prometheus_metric_readback(
)
return {
"schema_version": "reboot_slo_prometheus_metric_readback_v1",
"readback_present": bool(
active_results
or primary_results
or scalar_results.get("blocker_count")
or scalar_results.get("ready")
),
"readback_present": bool(combined_results),
"source": "prometheus_query_api",
"query_url": query_url,
"active_blockers": active_blockers,

View File

@@ -8,6 +8,7 @@ from fastapi.testclient import TestClient
from src.api.v1 import agents
from src.api.v1.agents import router
from src.services import reboot_auto_recovery_slo_scorecard as reboot_slo_scorecard
from src.services.reboot_auto_recovery_drill_preflight import (
load_latest_reboot_auto_recovery_drill_preflight,
)
@@ -296,6 +297,69 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
)
def test_reboot_slo_prometheus_readback_uses_single_combined_query(monkeypatch):
    """Check that the Prometheus readback is served by one combined query.

    The vector query helper is replaced with a recording fake that returns
    samples for every runtime metric in a single response. The test then
    verifies that the helper was called exactly once, with the supplied query
    URL and a 1.0 second timeout (the timeout override variable is removed
    from the environment first), and that the payload fields are populated
    from that single response.
    """
    monkeypatch.delenv("AWOOOI_REBOOT_SLO_PROMETHEUS_TIMEOUT_SECONDS", raising=False)
    expected_url = "http://prometheus.example/api/v1/query"
    sample_time = 1783010479
    recorded_calls = []

    def build_sample(metric_name, value, **labels):
        # Shape of one instant-vector entry as consumed by the readback code.
        return {
            "metric": {"__name__": metric_name, **labels},
            "value": [sample_time, value],
        }

    def fake_query_prometheus_vector(query_url, query, *, timeout_seconds):
        recorded_calls.append((query_url, query, timeout_seconds))
        samples = [
            build_sample(
                "awoooi_reboot_auto_recovery_slo_active_blocker",
                "1",
                blocker=blocker,
            )
            for blocker in PROMETHEUS_RUNTIME_BLOCKERS
        ]
        samples.append(
            build_sample(
                "awoooi_reboot_auto_recovery_slo_primary_blocker",
                "1",
                blocker="reboot_event_required_host_unreachable",
            )
        )
        samples.append(
            build_sample("awoooi_reboot_auto_recovery_slo_blocker_count", "7")
        )
        samples.append(build_sample("awoooi_reboot_auto_recovery_slo_ready", "0"))
        samples.append(
            build_sample(
                "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
                "1783010479",
            )
        )
        return samples

    monkeypatch.setattr(
        reboot_slo_scorecard,
        "_query_prometheus_vector",
        fake_query_prometheus_vector,
    )

    payload = reboot_slo_scorecard._load_reboot_slo_prometheus_metric_readback(
        prometheus_query_url=expected_url
    )

    # Exactly one round trip, carrying the URL and the default timeout.
    assert len(recorded_calls) == 1
    called_url, called_query, called_timeout = recorded_calls[0]
    assert called_url == "http://prometheus.example/api/v1/query"
    assert called_timeout == 1.0
    # The single query must mention both the first and the last runtime metric.
    assert "awoooi_reboot_auto_recovery_slo_active_blocker" in called_query
    assert "awoooi_reboot_auto_recovery_slo_last_run_timestamp" in called_query

    # Every payload field is derived from that one combined response.
    assert payload["readback_present"] is True
    assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
    assert payload["active_blocker_count"] == 7
    assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
    assert payload["ready"] is False
    assert payload["last_run_timestamp"] == 1783010479
def test_reboot_auto_recovery_slo_scorecard_endpoint_returns_readback(monkeypatch):
monkeypatch.setattr(
agents,