fix(reboot): overlay slo prometheus runtime blockers
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m0s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m0s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -9,6 +9,9 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -31,6 +34,17 @@ _PUBLIC_MAINTENANCE_RUNTIME_PATTERN = (
|
||||
"reboot-auto-recovery-slo-*/public-maintenance-fallback.json"
|
||||
)
|
||||
_RUNTIME_SCORECARD_PATTERN = "reboot-auto-recovery-slo-*/scorecard.json"
|
||||
# Environment variable that opts in to the live Prometheus readback; it is
# parsed with _env_flag by the scorecard loader.
_PROMETHEUS_READBACK_ENABLED_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_READBACK_ENABLED"
# Environment variable that overrides the Prometheus query endpoint URL.
_PROMETHEUS_QUERY_URL_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL"
# Environment variable holding the per-request timeout in seconds (parsed as a float).
_PROMETHEUS_TIMEOUT_SECONDS_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_TIMEOUT_SECONDS"
# Endpoint used when neither the caller nor the environment supplies a URL.
_DEFAULT_PROMETHEUS_QUERY_URL = "http://192.168.0.110:9090/api/v1/query"
# Readback field name -> Prometheus metric name sent as the query expression.
# "active_blocker" and "primary_blocker" are read for their "blocker" label;
# the remaining entries are read for their first sample value.
_PROMETHEUS_RUNTIME_QUERIES = {
    "active_blocker": "awoooi_reboot_auto_recovery_slo_active_blocker",
    "primary_blocker": "awoooi_reboot_auto_recovery_slo_primary_blocker",
    "blocker_count": "awoooi_reboot_auto_recovery_slo_blocker_count",
    "ready": "awoooi_reboot_auto_recovery_slo_ready",
    "last_run_timestamp": "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
}
|
||||
_PUBLIC_MAINTENANCE_BLOCKERS = {
|
||||
"public_maintenance_fallback_runtime_readback_missing",
|
||||
"public_route_raw_5xx_without_maintenance_fallback",
|
||||
@@ -44,6 +58,8 @@ def load_latest_reboot_auto_recovery_slo_scorecard(
|
||||
public_maintenance_runtime_dir: Path | None = None,
|
||||
runtime_scorecard_path: Path | None = None,
|
||||
runtime_scorecard_dir: Path | None = None,
|
||||
prometheus_metric_readback: dict[str, Any] | None = None,
|
||||
prometheus_query_url: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Load the committed P0-006 scorecard and overlay trusted runtime artifacts."""
|
||||
directory = operations_dir or _DEFAULT_OPERATIONS_DIR
|
||||
@@ -77,6 +93,15 @@ def load_latest_reboot_auto_recovery_slo_scorecard(
|
||||
)
|
||||
if public_maintenance_runtime:
|
||||
apply_public_maintenance_runtime_readback(payload, public_maintenance_runtime)
|
||||
metric_readback = prometheus_metric_readback
|
||||
if metric_readback is None and _env_flag(_PROMETHEUS_READBACK_ENABLED_ENV):
|
||||
metric_readback = _load_reboot_slo_prometheus_metric_readback(
|
||||
prometheus_query_url=prometheus_query_url,
|
||||
)
|
||||
if metric_readback:
|
||||
_annotate_prometheus_metric_readback(payload, metric_readback)
|
||||
if not runtime_scorecard:
|
||||
_apply_prometheus_metric_active_blockers(payload, metric_readback)
|
||||
_require_operation_boundaries(payload, str(path))
|
||||
return payload
|
||||
|
||||
@@ -179,6 +204,259 @@ def _load_latest_public_maintenance_runtime_readback(
|
||||
return _read_json_file(latest)
|
||||
|
||||
|
||||
def _load_reboot_slo_prometheus_metric_readback(
    *,
    prometheus_query_url: str | None = None,
) -> dict[str, Any]:
    """Read the reboot SLO blocker metrics from Prometheus on a best-effort basis.

    The endpoint is resolved from ``prometheus_query_url``, then from the
    environment variable named by ``_PROMETHEUS_QUERY_URL_ENV``, then from
    ``_DEFAULT_PROMETHEUS_QUERY_URL``. One query is issued per entry of
    ``_PROMETHEUS_RUNTIME_QUERIES``, each bounded by the timeout read from
    ``_PROMETHEUS_TIMEOUT_SECONDS_ENV`` (default 3.0 seconds).

    Args:
        prometheus_query_url: Optional explicit query endpoint.

    Returns:
        A ``reboot_slo_prometheus_metric_readback_v1`` dict. When any query
        fails it carries ``readback_present`` False plus ``error_class`` (the
        exception class name). Otherwise it carries ``active_blockers``,
        ``active_blocker_count``, ``primary_blocker``, ``ready`` and
        ``last_run_timestamp``; ``readback_present`` is True when at least one
        of the blocker, count or ready queries returned a series.
    """
    # Function-scope import: only the except clause below needs it.
    import http.client

    query_url = (
        prometheus_query_url
        or os.environ.get(_PROMETHEUS_QUERY_URL_ENV, "").strip()
        or _DEFAULT_PROMETHEUS_QUERY_URL
    )
    timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=3.0)
    try:
        active_results = _query_prometheus_vector(
            query_url,
            _PROMETHEUS_RUNTIME_QUERIES["active_blocker"],
            timeout_seconds=timeout_seconds,
        )
        primary_results = _query_prometheus_vector(
            query_url,
            _PROMETHEUS_RUNTIME_QUERIES["primary_blocker"],
            timeout_seconds=timeout_seconds,
        )
        scalar_results = {
            key: _query_prometheus_vector(
                query_url,
                query,
                timeout_seconds=timeout_seconds,
            )
            for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
            if key not in {"active_blocker", "primary_blocker"}
        }
    except (
        OSError,
        TimeoutError,
        urllib.error.URLError,
        json.JSONDecodeError,
        ValueError,
        # BadStatusLine / IncompleteRead are raised by http.client without
        # being wrapped in URLError and are not OSError subclasses; without
        # this entry a malformed or truncated response would escape the
        # best-effort readback and fail the whole scorecard load.
        http.client.HTTPException,
    ) as exc:
        return {
            "schema_version": "reboot_slo_prometheus_metric_readback_v1",
            "readback_present": False,
            "source": "prometheus_query_api",
            "query_url": query_url,
            "error_class": exc.__class__.__name__,
        }

    # Every series carrying a non-empty "blocker" label counts as one blocker.
    active_blockers = _unique_strings(
        [
            str(_dict(result.get("metric")).get("blocker") or "")
            for result in active_results
            if str(_dict(result.get("metric")).get("blocker") or "")
        ]
    )
    primary_blocker = next(
        (
            str(_dict(result.get("metric")).get("blocker") or "")
            for result in primary_results
            if str(_dict(result.get("metric")).get("blocker") or "")
        ),
        "",
    )
    if not primary_blocker:
        # No primary series published: derive one from the active blocker list.
        primary_blocker = _reboot_sop_primary_blocker(active_blockers)
    blocker_count = _prometheus_first_int(
        scalar_results.get("blocker_count") or [],
        default=len(active_blockers),
    )
    ready = _prometheus_first_int(scalar_results.get("ready") or [], default=0) == 1
    last_run_timestamp = _prometheus_first_int(
        scalar_results.get("last_run_timestamp") or [],
        default=0,
    )
    return {
        "schema_version": "reboot_slo_prometheus_metric_readback_v1",
        "readback_present": bool(
            active_results
            or primary_results
            or scalar_results.get("blocker_count")
            or scalar_results.get("ready")
        ),
        "source": "prometheus_query_api",
        "query_url": query_url,
        "active_blockers": active_blockers,
        "active_blocker_count": blocker_count,
        "primary_blocker": primary_blocker,
        "ready": ready,
        "last_run_timestamp": last_run_timestamp,
    }
|
||||
|
||||
|
||||
def _query_prometheus_vector(
|
||||
query_url: str,
|
||||
query: str,
|
||||
*,
|
||||
timeout_seconds: float,
|
||||
) -> list[dict[str, Any]]:
|
||||
url = f"{query_url}?{urllib.parse.urlencode({'query': query})}"
|
||||
request = urllib.request.Request(url, method="GET")
|
||||
with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
|
||||
payload = json.loads(response.read(1_000_000).decode("utf-8", errors="replace"))
|
||||
if not isinstance(payload, dict) or payload.get("status") != "success":
|
||||
raise ValueError("prometheus_status_not_success")
|
||||
return _list_of_dicts(_dict(payload.get("data")).get("result"))
|
||||
|
||||
|
||||
def _prometheus_first_int(results: list[dict[str, Any]], *, default: int) -> int:
|
||||
if not results:
|
||||
return default
|
||||
value_pair = _list(_dict(results[0]).get("value"))
|
||||
if len(value_pair) < 2:
|
||||
return default
|
||||
try:
|
||||
return int(float(value_pair[1]))
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _annotate_prometheus_metric_readback(
    payload: dict[str, Any],
    metric_readback: dict[str, Any],
) -> None:
    """Record the Prometheus readback on the scorecard payload, in place.

    The raw readback is stored under ``runtime_metric_readback``. Its
    presence flag, source, blocker count and last-run timestamp are written
    to the top level of *payload*; the presence flag, count and timestamp
    are also copied into the ``readback`` and ``rollups`` sections, and the
    source into ``readback`` only.
    """
    is_present = metric_readback.get("readback_present") is True
    source = str(metric_readback.get("source") or "")
    blocker_count = _int(metric_readback.get("active_blocker_count"))
    last_run = _int(metric_readback.get("last_run_timestamp"))

    payload.update(
        {
            "runtime_metric_readback": metric_readback,
            "runtime_metric_readback_present": is_present,
            "runtime_metric_source": source,
            "runtime_metric_active_blocker_count": blocker_count,
            "runtime_metric_last_run_timestamp": last_run,
        }
    )

    readback_section = _dict(payload.setdefault("readback", {}))
    readback_section.update(
        {
            "runtime_metric_readback_present": is_present,
            "runtime_metric_source": source,
            "runtime_metric_active_blocker_count": blocker_count,
            "runtime_metric_last_run_timestamp": last_run,
        }
    )

    rollup_section = _dict(payload.setdefault("rollups", {}))
    rollup_section.update(
        {
            "runtime_metric_readback_present": is_present,
            "runtime_metric_active_blocker_count": blocker_count,
            "runtime_metric_last_run_timestamp": last_run,
        }
    )
|
||||
|
||||
|
||||
def _apply_prometheus_metric_active_blockers(
    payload: dict[str, Any],
    metric_readback: dict[str, Any],
) -> None:
    """Replace the scorecard's active-blocker state with the Prometheus readback.

    Nothing is changed when the readback is not marked present, or when it
    names no blockers while still reporting a non-zero blocker count.
    Otherwise the top-level payload fields and the ``reboot_sop_progress``,
    ``readback`` and ``rollups`` sections are updated in place.
    """
    if metric_readback.get("readback_present") is not True:
        return
    blockers = _strings(metric_readback.get("active_blockers"))
    if not blockers:
        # A positive count without any blocker names is inconsistent; leave
        # the existing scorecard state untouched.
        if _int(metric_readback.get("active_blocker_count")):
            return

    slo_claimable = metric_readback.get("ready") is True and not blockers
    reported_primary = metric_readback.get("primary_blocker")
    primary = str(
        reported_primary
        if reported_primary
        else _reboot_sop_primary_blocker(blockers)
    )
    phase = _reboot_sop_current_phase(blockers, slo_claimable)
    wait_reason = _reboot_sop_eta_or_wait_reason(
        scorecard=payload,
        active_blockers=blockers,
        current_phase=phase,
        primary_blocker=primary,
    )
    matrix = _build_active_blocker_action_matrix(
        active_blockers=blockers,
        primary_blocker=primary,
    )
    primary_action = _dict(matrix.get("primary_blocker_action"))
    # Prefer the action attached to the primary blocker, then whatever the
    # scorecard already carried.
    next_action = str(
        primary_action.get("next_safe_action")
        or _dict(payload.get("reboot_sop_progress")).get("next_safe_action")
        or payload.get("next_safe_action")
        or ""
    )
    blocker_total = len(blockers)
    fresh_window_only = blockers == [
        "host_boot_observation_older_than_target_window"
    ]

    payload.update(
        {
            "active_blockers": blockers,
            "active_blocker_count": blocker_total,
            "primary_blocker": primary,
            "current_phase": phase,
            "eta_or_wait_reason": wait_reason,
            "next_safe_action": next_action,
            "can_claim_all_services_recovered_within_target": slo_claimable,
            "blocked_by_fresh_reboot_window_only": fresh_window_only,
            "active_blocker_action_matrix": matrix,
        }
    )

    progress_section = _dict(payload.setdefault("reboot_sop_progress", {}))
    progress_section["current_phase"] = phase
    progress_section["eta_or_wait_reason"] = wait_reason
    progress_section["primary_blocker"] = primary
    progress_section["active_blockers"] = blockers
    progress_section["active_blocker_count"] = blocker_total
    progress_section["next_safe_action"] = next_action

    readback_section = _dict(payload.setdefault("readback", {}))
    owner_lane = primary_action.get("owner_lane", "")
    action_category = primary_action.get("category", "")
    action_count = matrix["item_count"]
    telegram_count = matrix["telegram_alert_required_count"]
    readback_section.update(
        {
            "current_phase": phase,
            "eta_or_wait_reason": wait_reason,
            "primary_blocker": primary,
            "primary_blocker_owner_lane": owner_lane,
            "primary_blocker_action_category": action_category,
            "active_blocker_count": blocker_total,
            "active_blocker_action_count": action_count,
            "telegram_active_blocker_alert_required_count": telegram_count,
            "next_safe_action": next_action,
            "blocked_by_fresh_reboot_window_only": fresh_window_only,
        }
    )

    rollup_section = _dict(payload.setdefault("rollups", {}))
    category_counts = matrix["category_counts"]
    owner_lane_counts = matrix["owner_lane_counts"]
    rollup_section.update(
        {
            "active_blocker_count": blocker_total,
            "active_blocker_action_count": action_count,
            "active_blocker_action_category_counts": category_counts,
            "active_blocker_action_owner_lane_counts": owner_lane_counts,
            "primary_blocker_owner_lane": owner_lane,
            "primary_blocker_action_category": action_category,
            "telegram_active_blocker_alert_required_count": telegram_count,
            "can_claim_all_services_recovered_within_target": slo_claimable,
            "blocked_by_fresh_reboot_window_only": fresh_window_only,
        }
    )
|
||||
|
||||
|
||||
def _path_from_env(name: str) -> Path | None:
|
||||
value = os.environ.get(name, "").strip()
|
||||
return Path(value) if value else None
|
||||
@@ -1471,6 +1749,17 @@ def _int(value: Any) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _env_flag(name: str) -> bool:
|
||||
return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _env_float(name: str, *, default: float) -> float:
|
||||
try:
|
||||
return float(os.environ.get(name, "").strip() or default)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def _percent(value: Any) -> int:
|
||||
return max(0, min(100, round(float(value or 0))))
|
||||
|
||||
@@ -1487,6 +1776,10 @@ def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
|
||||
return [item for item in value if isinstance(item, dict)]
|
||||
|
||||
|
||||
def _list(value: Any) -> list[Any]:
|
||||
return value if isinstance(value, list) else []
|
||||
|
||||
|
||||
def _unique_strings(values: list[str]) -> list[str]:
|
||||
seen: set[str] = set()
|
||||
unique: list[str] = []
|
||||
|
||||
@@ -79,6 +79,28 @@ _SOURCE_SCORECARD = (
|
||||
/ "awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json"
|
||||
)
|
||||
|
||||
# Blocker names used as the "active_blockers" list of the injected
# Prometheus readback fixture below.
PROMETHEUS_RUNTIME_BLOCKERS = [
    "all_required_hosts_not_in_10_minute_reboot_window",
    "conversation_event_hot_path_index_migration_source_missing",
    "host_boot_observation_older_than_target_window",
    "host_unreachable_after_reboot",
    "host_uptime_unknown",
    "reboot_event_required_host_unreachable",
    "windows99_vmware_autostart_readback_missing",
]

# Pre-built readback passed to the loader as ``prometheus_metric_readback``
# so the overlay test does not query Prometheus over the network.
# "active_blocker_count" matches the seven names listed above.
PROMETHEUS_RUNTIME_READBACK = {
    "schema_version": "reboot_slo_prometheus_metric_readback_v1",
    "readback_present": True,
    "source": "prometheus_query_api",
    "query_url": "http://192.168.0.110:9090/api/v1/query",
    "active_blockers": PROMETHEUS_RUNTIME_BLOCKERS,
    "active_blocker_count": 7,
    "primary_blocker": "reboot_event_required_host_unreachable",
    "ready": False,
    "last_run_timestamp": 1783010479,
}
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_loader_exposes_stockplatform_gate():
|
||||
payload = load_latest_reboot_auto_recovery_slo_scorecard()
|
||||
@@ -226,6 +248,34 @@ def test_reboot_auto_recovery_slo_scorecard_uses_latest_runtime_scorecard_from_d
|
||||
)
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics():
    """An injected Prometheus readback drives the blocker fields of the scorecard."""
    payload = load_latest_reboot_auto_recovery_slo_scorecard(
        prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK
    )

    # Readback bookkeeping at the top level of the payload.
    assert payload["runtime_scorecard_readback_present"] is False
    assert payload["runtime_metric_readback_present"] is True
    assert payload["runtime_metric_active_blocker_count"] == 7
    assert payload["runtime_metric_last_run_timestamp"] == 1783010479

    # Blocker state taken over from the readback.
    blockers = payload["active_blockers"]
    assert blockers == PROMETHEUS_RUNTIME_BLOCKERS
    assert payload["active_blocker_count"] == 7
    assert payload["readiness_percent"] == 47
    assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
    expected_action = "rerun_reboot_event_detector_and_host_probe_verify_only_no_reboot"
    assert payload["next_safe_action"] == expected_action
    for absent_blocker in ("backup_core_green_not_1", "service_green_not_1"):
        assert absent_blocker not in blockers
    assert payload["active_blocker_action_matrix"]["item_count"] == 7

    # The same figures are mirrored into the nested sections.
    readback_section = payload["readback"]
    assert readback_section["active_blocker_count"] == 7
    assert readback_section["runtime_metric_readback_present"] is True
    rollup_section = payload["rollups"]
    assert rollup_section["active_blocker_count"] == 7
    assert rollup_section["runtime_metric_readback_present"] is True
    expected_lane = "reboot_event_detector_and_host_probe"
    assert rollup_section["primary_blocker_owner_lane"] == expected_lane
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_endpoint_returns_readback(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
agents,
|
||||
|
||||
@@ -176,6 +176,10 @@ spec:
|
||||
value: "24"
|
||||
- name: AWOOOP_ANSIBLE_CANDIDATE_BACKFILL_STARTUP_SLEEP_SECONDS
|
||||
value: "60"
|
||||
- name: AWOOOI_REBOOT_SLO_PROMETHEUS_READBACK_ENABLED
|
||||
value: "true"
|
||||
- name: AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL
|
||||
value: "http://192.168.0.110:9090/api/v1/query"
|
||||
# 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
|
||||
volumeMounts:
|
||||
- name: repair-ssh-key
|
||||
|
||||
Reference in New Issue
Block a user