fix(reboot): overlay slo prometheus runtime blockers
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m0s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-03 00:47:46 +08:00
parent 85d8eeb0db
commit c0f43ae080
3 changed files with 347 additions and 0 deletions

View File

@@ -9,6 +9,9 @@ from __future__ import annotations
import json
import os
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Any
@@ -31,6 +34,17 @@ _PUBLIC_MAINTENANCE_RUNTIME_PATTERN = (
"reboot-auto-recovery-slo-*/public-maintenance-fallback.json"
)
_RUNTIME_SCORECARD_PATTERN = "reboot-auto-recovery-slo-*/scorecard.json"
# Environment variable that switches the Prometheus readback on; it is read
# through _env_flag(), so "1", "true", "yes" and "on" enable it.
_PROMETHEUS_READBACK_ENABLED_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_READBACK_ENABLED"
# Environment variable that overrides the Prometheus query endpoint URL.
_PROMETHEUS_QUERY_URL_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL"
# Environment variable holding the per-request timeout in seconds (float).
_PROMETHEUS_TIMEOUT_SECONDS_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_TIMEOUT_SECONDS"
# Endpoint used when neither the caller nor the environment supplies a URL.
# NOTE(review): this is a hard-coded private address; deployments are
# presumably expected to override it via the env var above -- confirm.
_DEFAULT_PROMETHEUS_QUERY_URL = "http://192.168.0.110:9090/api/v1/query"
# Logical name -> query expression sent to the Prometheus query endpoint.
# The "active_blocker" and "primary_blocker" results are read for their
# "blocker" label; the remaining results are read as a single sample value.
_PROMETHEUS_RUNTIME_QUERIES = {
    "active_blocker": "awoooi_reboot_auto_recovery_slo_active_blocker",
    "primary_blocker": "awoooi_reboot_auto_recovery_slo_primary_blocker",
    "blocker_count": "awoooi_reboot_auto_recovery_slo_blocker_count",
    "ready": "awoooi_reboot_auto_recovery_slo_ready",
    "last_run_timestamp": "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
}
_PUBLIC_MAINTENANCE_BLOCKERS = {
"public_maintenance_fallback_runtime_readback_missing",
"public_route_raw_5xx_without_maintenance_fallback",
@@ -44,6 +58,8 @@ def load_latest_reboot_auto_recovery_slo_scorecard(
public_maintenance_runtime_dir: Path | None = None,
runtime_scorecard_path: Path | None = None,
runtime_scorecard_dir: Path | None = None,
prometheus_metric_readback: dict[str, Any] | None = None,
prometheus_query_url: str | None = None,
) -> dict[str, Any]:
"""Load the committed P0-006 scorecard and overlay trusted runtime artifacts."""
directory = operations_dir or _DEFAULT_OPERATIONS_DIR
@@ -77,6 +93,15 @@ def load_latest_reboot_auto_recovery_slo_scorecard(
)
if public_maintenance_runtime:
apply_public_maintenance_runtime_readback(payload, public_maintenance_runtime)
metric_readback = prometheus_metric_readback
if metric_readback is None and _env_flag(_PROMETHEUS_READBACK_ENABLED_ENV):
metric_readback = _load_reboot_slo_prometheus_metric_readback(
prometheus_query_url=prometheus_query_url,
)
if metric_readback:
_annotate_prometheus_metric_readback(payload, metric_readback)
if not runtime_scorecard:
_apply_prometheus_metric_active_blockers(payload, metric_readback)
_require_operation_boundaries(payload, str(path))
return payload
@@ -179,6 +204,259 @@ def _load_latest_public_maintenance_runtime_readback(
return _read_json_file(latest)
def _load_reboot_slo_prometheus_metric_readback(
    *,
    prometheus_query_url: str | None = None,
) -> dict[str, Any]:
    """Query Prometheus for the reboot SLO runtime metrics and summarise them.

    The endpoint is taken from ``prometheus_query_url``, then from the
    ``_PROMETHEUS_QUERY_URL_ENV`` environment variable, then from
    ``_DEFAULT_PROMETHEUS_QUERY_URL``. Every entry of
    ``_PROMETHEUS_RUNTIME_QUERIES`` is queried in declaration order.

    This is a best-effort readback: transport, HTTP-protocol and decoding
    failures are not raised. They produce a result with
    ``readback_present`` set to ``False`` and the exception class name in
    ``error_class``.

    Args:
        prometheus_query_url: Optional explicit query endpoint URL.

    Returns:
        A ``reboot_slo_prometheus_metric_readback_v1`` dict. On success it
        carries ``active_blockers``, ``active_blocker_count``,
        ``primary_blocker``, ``ready`` and ``last_run_timestamp``; on failure
        it carries ``error_class`` instead.
    """
    # Local import: only the exception base class is needed, and only here.
    import http.client

    default_timeout_seconds = 3.0
    query_url = (
        (prometheus_query_url or "").strip()
        or os.environ.get(_PROMETHEUS_QUERY_URL_ENV, "").strip()
        or _DEFAULT_PROMETHEUS_QUERY_URL
    )
    timeout_seconds = _env_float(
        _PROMETHEUS_TIMEOUT_SECONDS_ENV,
        default=default_timeout_seconds,
    )
    # A zero, negative, NaN or infinite timeout is a misconfiguration; an
    # infinite one makes the socket layer raise OverflowError, which would
    # escape the handler below. Fall back to the default instead.
    if not 0 < timeout_seconds < float("inf"):
        timeout_seconds = default_timeout_seconds

    try:
        vectors = {
            key: _query_prometheus_vector(
                query_url,
                query,
                timeout_seconds=timeout_seconds,
            )
            for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
        }
    except (
        # OSError covers urllib.error.URLError, HTTPError and timeouts.
        OSError,
        # ValueError covers json.JSONDecodeError and bad-URL/bad-status errors.
        ValueError,
        # Malformed HTTP responses (BadStatusLine, IncompleteRead, InvalidURL)
        # are neither OSError nor ValueError.
        http.client.HTTPException,
    ) as exc:
        return {
            "schema_version": "reboot_slo_prometheus_metric_readback_v1",
            "readback_present": False,
            "source": "prometheus_query_api",
            "query_url": query_url,
            "error_class": exc.__class__.__name__,
        }

    def _blocker_labels(results: list[dict[str, Any]]) -> list[str]:
        # Non-empty "blocker" label values, in result order.
        labels = (
            str(_dict(result.get("metric")).get("blocker") or "")
            for result in results
        )
        return [label for label in labels if label]

    active_results = vectors.get("active_blocker") or []
    primary_results = vectors.get("primary_blocker") or []
    blocker_count_results = vectors.get("blocker_count") or []
    ready_results = vectors.get("ready") or []

    active_blockers = _unique_strings(_blocker_labels(active_results))
    primary_blocker = next(iter(_blocker_labels(primary_results)), "")
    if not primary_blocker:
        # No primary series reported: derive one from the active blockers.
        primary_blocker = _reboot_sop_primary_blocker(active_blockers)
    blocker_count = _prometheus_first_int(
        blocker_count_results,
        default=len(active_blockers),
    )
    ready = _prometheus_first_int(ready_results, default=0) == 1
    last_run_timestamp = _prometheus_first_int(
        vectors.get("last_run_timestamp") or [],
        default=0,
    )
    return {
        "schema_version": "reboot_slo_prometheus_metric_readback_v1",
        # The timestamp series alone does not count as a readback.
        "readback_present": bool(
            active_results
            or primary_results
            or blocker_count_results
            or ready_results
        ),
        "source": "prometheus_query_api",
        "query_url": query_url,
        "active_blockers": active_blockers,
        "active_blocker_count": blocker_count,
        "primary_blocker": primary_blocker,
        "ready": ready,
        "last_run_timestamp": last_run_timestamp,
    }
def _query_prometheus_vector(
    query_url: str,
    query: str,
    *,
    timeout_seconds: float,
) -> list[dict[str, Any]]:
    """Run one query against a Prometheus query endpoint and return its results.

    Args:
        query_url: Endpoint URL; only ``http`` and ``https`` are accepted. A
            query string already present on the URL is kept and the
            ``query`` parameter is appended to it.
        query: Query expression, sent as the ``query`` parameter.
        timeout_seconds: Timeout passed to ``urllib.request.urlopen``.

    Returns:
        The dict entries of ``data.result`` from the response payload.

    Raises:
        ValueError: If the URL scheme is not http/https, the body is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``), or the
            payload status is not ``"success"``.
        OSError: On network failures (``urllib.error.URLError``, timeouts).
    """
    parts = urllib.parse.urlsplit(query_url)
    # urlopen would otherwise follow file:// or ftp:// URLs taken from config.
    if parts.scheme not in {"http", "https"}:
        raise ValueError("prometheus_query_url_scheme_not_allowed")
    encoded_query = urllib.parse.urlencode({"query": query})
    merged_query = (
        f"{parts.query}&{encoded_query}" if parts.query else encoded_query
    )
    # Rebuild the URL so the parameter lands in the query component even when
    # the base URL already has a query string or a fragment.
    url = urllib.parse.urlunsplit(
        parts._replace(query=merged_query, fragment="")
    )
    request = urllib.request.Request(url, method="GET")
    with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
        # Cap the body size; a truncated body fails JSON decoding below.
        body = response.read(1_000_000)
    payload = json.loads(body.decode("utf-8", errors="replace"))
    if not isinstance(payload, dict) or payload.get("status") != "success":
        raise ValueError("prometheus_status_not_success")
    return _list_of_dicts(_dict(payload.get("data")).get("result"))
def _prometheus_first_int(results: list[dict[str, Any]], *, default: int) -> int:
    """Return the first sample's value as an int, or ``default``.

    Each result is expected to carry ``"value": [timestamp, "<number>"]``.
    The number is parsed as a float and truncated towards zero.

    Args:
        results: Result entries from a Prometheus query.
        default: Value returned when there is no usable sample.

    Returns:
        The truncated sample value, or ``default`` when ``results`` is empty,
        the first entry has no two-element ``value`` list, or the value is
        not a finite number.
    """
    if not results:
        return default
    first = results[0]
    value_pair = first.get("value") if isinstance(first, dict) else None
    if not isinstance(value_pair, list) or len(value_pair) < 2:
        return default
    try:
        return int(float(value_pair[1]))
    except (TypeError, ValueError, OverflowError):
        # "NaN" raises ValueError; "+Inf"/"-Inf" raise OverflowError in int().
        return default
def _annotate_prometheus_metric_readback(
    payload: dict[str, Any],
    metric_readback: dict[str, Any],
) -> None:
    """Record the Prometheus readback and its summary fields on ``payload``.

    The raw readback is stored under ``runtime_metric_readback``. The
    presence flag, source, active-blocker count and last-run timestamp are
    written to the top level and to ``payload["readback"]``; all but the
    source are also written to ``payload["rollups"]``. ``payload`` is
    mutated in place.
    """
    payload["runtime_metric_readback"] = metric_readback
    summary = {
        "runtime_metric_readback_present": (
            metric_readback.get("readback_present") is True
        ),
        "runtime_metric_source": str(metric_readback.get("source") or ""),
        "runtime_metric_active_blocker_count": _int(
            metric_readback.get("active_blocker_count")
        ),
        "runtime_metric_last_run_timestamp": _int(
            metric_readback.get("last_run_timestamp")
        ),
    }
    payload.update(summary)

    readback_section = _dict(payload.setdefault("readback", {}))
    readback_section.update(summary)

    # The rollups section mirrors the summary except for the source field.
    rollups_section = _dict(payload.setdefault("rollups", {}))
    rollups_section.update(
        {
            key: value
            for key, value in summary.items()
            if key != "runtime_metric_source"
        }
    )
def _apply_prometheus_metric_active_blockers(
    payload: dict[str, Any],
    metric_readback: dict[str, Any],
) -> None:
    """Replace the scorecard's active-blocker view with the Prometheus readback.

    ``payload`` is mutated in place: its top-level blocker fields, the
    ``reboot_sop_progress`` section, the ``readback`` section and the
    ``rollups`` section are all updated. Nothing is changed when the
    readback is not marked present, or when it reports a non-zero blocker
    count without naming any blocker.
    """
    if metric_readback.get("readback_present") is not True:
        return
    blockers = _strings(metric_readback.get("active_blockers"))
    if not blockers and _int(metric_readback.get("active_blocker_count")):
        # The counter says blockers exist but none are named: do not overlay.
        return

    slo_claimable = metric_readback.get("ready") is True and not blockers
    reported_primary = metric_readback.get("primary_blocker")
    primary = str(reported_primary or _reboot_sop_primary_blocker(blockers))
    phase = _reboot_sop_current_phase(blockers, slo_claimable)
    # Computed against the payload as it is before the overlay is written.
    wait_reason = _reboot_sop_eta_or_wait_reason(
        scorecard=payload,
        active_blockers=blockers,
        current_phase=phase,
        primary_blocker=primary,
    )
    matrix = _build_active_blocker_action_matrix(
        active_blockers=blockers,
        primary_blocker=primary,
    )
    primary_action = _dict(matrix.get("primary_blocker_action"))
    # Prefer the action for the primary blocker, then whatever the payload
    # already carried.
    next_action = str(
        primary_action.get("next_safe_action")
        or _dict(payload.get("reboot_sop_progress")).get("next_safe_action")
        or payload.get("next_safe_action")
        or ""
    )
    owner_lane = primary_action.get("owner_lane", "")
    action_category = primary_action.get("category", "")

    blocker_total = len(blockers)
    fresh_window_only = blockers == [
        "host_boot_observation_older_than_target_window"
    ]
    payload.update(
        active_blockers=blockers,
        active_blocker_count=blocker_total,
        primary_blocker=primary,
        current_phase=phase,
        eta_or_wait_reason=wait_reason,
        next_safe_action=next_action,
        can_claim_all_services_recovered_within_target=slo_claimable,
        blocked_by_fresh_reboot_window_only=fresh_window_only,
        active_blocker_action_matrix=matrix,
    )

    progress_section = _dict(payload.setdefault("reboot_sop_progress", {}))
    progress_section.update(
        current_phase=phase,
        eta_or_wait_reason=wait_reason,
        primary_blocker=primary,
        active_blockers=blockers,
        active_blocker_count=blocker_total,
        next_safe_action=next_action,
    )

    action_total = matrix["item_count"]
    telegram_alert_total = matrix["telegram_alert_required_count"]
    readback_section = _dict(payload.setdefault("readback", {}))
    readback_section.update(
        current_phase=phase,
        eta_or_wait_reason=wait_reason,
        primary_blocker=primary,
        primary_blocker_owner_lane=owner_lane,
        primary_blocker_action_category=action_category,
        active_blocker_count=blocker_total,
        active_blocker_action_count=action_total,
        telegram_active_blocker_alert_required_count=telegram_alert_total,
        next_safe_action=next_action,
        blocked_by_fresh_reboot_window_only=fresh_window_only,
    )

    rollups_section = _dict(payload.setdefault("rollups", {}))
    rollups_section.update(
        active_blocker_count=blocker_total,
        active_blocker_action_count=action_total,
        active_blocker_action_category_counts=matrix["category_counts"],
        active_blocker_action_owner_lane_counts=matrix["owner_lane_counts"],
        primary_blocker_owner_lane=owner_lane,
        primary_blocker_action_category=action_category,
        telegram_active_blocker_alert_required_count=telegram_alert_total,
        can_claim_all_services_recovered_within_target=slo_claimable,
        blocked_by_fresh_reboot_window_only=fresh_window_only,
    )
def _path_from_env(name: str) -> Path | None:
    """Return the environment variable ``name`` as a ``Path``.

    Surrounding whitespace is stripped; an unset or blank variable yields
    ``None``.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return None
    return Path(raw)
@@ -1471,6 +1749,17 @@ def _int(value: Any) -> int:
return 0
def _env_flag(name: str) -> bool:
    """Return whether the environment variable ``name`` holds a truthy flag.

    The value is stripped and lower-cased; ``1``, ``true``, ``yes`` and
    ``on`` count as enabled. Anything else, including unset, is disabled.
    """
    normalized = os.environ.get(name, "").strip().lower()
    return normalized in ("1", "true", "yes", "on")
def _env_float(name: str, *, default: float) -> float:
    """Return the environment variable ``name`` parsed as a finite float.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank, not a number,
            or not finite.

    Returns:
        The parsed value, or ``default`` as described above.
    """
    # Local import keeps this helper self-contained.
    import math

    try:
        value = float(os.environ.get(name, "").strip() or default)
    except ValueError:
        return default
    # float() accepts "nan" and "inf"; neither is a usable setting, and an
    # infinite socket timeout raises OverflowError further down the line.
    if not math.isfinite(value):
        return default
    return value
def _percent(value: Any) -> int:
    """Round ``value`` to the nearest integer and clamp it to 0..100.

    Falsy input (``None``, ``0``, ``""``) is treated as zero.
    """
    rounded = round(float(value or 0))
    if rounded < 0:
        return 0
    if rounded > 100:
        return 100
    return rounded
@@ -1487,6 +1776,10 @@ def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
return [item for item in value if isinstance(item, dict)]
def _list(value: Any) -> list[Any]:
    """Return ``value`` unchanged when it is a list, otherwise a new empty list."""
    if isinstance(value, list):
        return value
    return []
def _unique_strings(values: list[str]) -> list[str]:
seen: set[str] = set()
unique: list[str] = []

View File

@@ -79,6 +79,28 @@ _SOURCE_SCORECARD = (
/ "awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json"
)
# Blocker names that the fake Prometheus readback reports as active.
PROMETHEUS_RUNTIME_BLOCKERS = [
    "all_required_hosts_not_in_10_minute_reboot_window",
    "conversation_event_hot_path_index_migration_source_missing",
    "host_boot_observation_older_than_target_window",
    "host_unreachable_after_reboot",
    "host_uptime_unknown",
    "reboot_event_required_host_unreachable",
    "windows99_vmware_autostart_readback_missing",
]

# Pre-built readback handed to the loader in place of a live Prometheus query.
PROMETHEUS_RUNTIME_READBACK = {
    "schema_version": "reboot_slo_prometheus_metric_readback_v1",
    "readback_present": True,
    "source": "prometheus_query_api",
    "query_url": "http://192.168.0.110:9090/api/v1/query",
    # Copy the list so code under test cannot mutate the expected values.
    "active_blockers": list(PROMETHEUS_RUNTIME_BLOCKERS),
    # Derived, so the count cannot drift from the list above.
    "active_blocker_count": len(PROMETHEUS_RUNTIME_BLOCKERS),
    "primary_blocker": "reboot_event_required_host_unreachable",
    "ready": False,
    "last_run_timestamp": 1783010479,
}
def test_reboot_auto_recovery_slo_scorecard_loader_exposes_stockplatform_gate():
payload = load_latest_reboot_auto_recovery_slo_scorecard()
@@ -226,6 +248,34 @@ def test_reboot_auto_recovery_slo_scorecard_uses_latest_runtime_scorecard_from_d
)
def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics():
    """An injected Prometheus readback overlays the scorecard's blocker fields."""
    expected_blocker_count = 7
    expected_primary = "reboot_event_required_host_unreachable"

    payload = load_latest_reboot_auto_recovery_slo_scorecard(
        prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK
    )

    # Readback metadata is annotated on the payload.
    assert payload["runtime_scorecard_readback_present"] is False
    assert payload["runtime_metric_readback_present"] is True
    assert payload["runtime_metric_active_blocker_count"] == expected_blocker_count
    assert payload["runtime_metric_last_run_timestamp"] == 1783010479

    # Top-level blocker fields follow the readback.
    blockers = payload["active_blockers"]
    assert blockers == PROMETHEUS_RUNTIME_BLOCKERS
    assert payload["active_blocker_count"] == expected_blocker_count
    assert payload["readiness_percent"] == 47
    assert payload["primary_blocker"] == expected_primary
    assert payload["next_safe_action"] == (
        "rerun_reboot_event_detector_and_host_probe_verify_only_no_reboot"
    )
    for absent_blocker in ("backup_core_green_not_1", "service_green_not_1"):
        assert absent_blocker not in blockers
    assert payload["active_blocker_action_matrix"]["item_count"] == expected_blocker_count

    # The readback and rollups sections carry the same view.
    readback = payload["readback"]
    assert readback["active_blocker_count"] == expected_blocker_count
    assert readback["runtime_metric_readback_present"] is True
    rollups = payload["rollups"]
    assert rollups["active_blocker_count"] == expected_blocker_count
    assert rollups["runtime_metric_readback_present"] is True
    assert rollups["primary_blocker_owner_lane"] == (
        "reboot_event_detector_and_host_probe"
    )
def test_reboot_auto_recovery_slo_scorecard_endpoint_returns_readback(monkeypatch):
monkeypatch.setattr(
agents,

View File

@@ -176,6 +176,10 @@ spec:
value: "24"
- name: AWOOOP_ANSIBLE_CANDIDATE_BACKFILL_STARTUP_SLEEP_SECONDS
value: "60"
- name: AWOOOI_REBOOT_SLO_PROMETHEUS_READBACK_ENABLED
value: "true"
- name: AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL
value: "http://192.168.0.110:9090/api/v1/query"
# 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
volumeMounts:
- name: repair-ssh-key