diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py
index 88be898df..d9cbbfad9 100644
--- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py
+++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py
@@ -9,6 +9,10 @@ from __future__ import annotations
 
+import http.client
 import json
 import os
+import urllib.error
+import urllib.parse
+import urllib.request
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -31,6 +35,17 @@ _PUBLIC_MAINTENANCE_RUNTIME_PATTERN = (
     "reboot-auto-recovery-slo-*/public-maintenance-fallback.json"
 )
 _RUNTIME_SCORECARD_PATTERN = "reboot-auto-recovery-slo-*/scorecard.json"
+_PROMETHEUS_READBACK_ENABLED_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_READBACK_ENABLED"
+_PROMETHEUS_QUERY_URL_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL"
+_PROMETHEUS_TIMEOUT_SECONDS_ENV = "AWOOOI_REBOOT_SLO_PROMETHEUS_TIMEOUT_SECONDS"
+_DEFAULT_PROMETHEUS_QUERY_URL = "http://192.168.0.110:9090/api/v1/query"
+_PROMETHEUS_RUNTIME_QUERIES = {
+    "active_blocker": "awoooi_reboot_auto_recovery_slo_active_blocker",
+    "primary_blocker": "awoooi_reboot_auto_recovery_slo_primary_blocker",
+    "blocker_count": "awoooi_reboot_auto_recovery_slo_blocker_count",
+    "ready": "awoooi_reboot_auto_recovery_slo_ready",
+    "last_run_timestamp": "awoooi_reboot_auto_recovery_slo_last_run_timestamp",
+}
 _PUBLIC_MAINTENANCE_BLOCKERS = {
     "public_maintenance_fallback_runtime_readback_missing",
     "public_route_raw_5xx_without_maintenance_fallback",
@@ -44,6 +59,8 @@ def load_latest_reboot_auto_recovery_slo_scorecard(
     public_maintenance_runtime_dir: Path | None = None,
     runtime_scorecard_path: Path | None = None,
     runtime_scorecard_dir: Path | None = None,
+    prometheus_metric_readback: dict[str, Any] | None = None,
+    prometheus_query_url: str | None = None,
 ) -> dict[str, Any]:
     """Load the committed P0-006 scorecard and overlay trusted runtime artifacts."""
     directory = operations_dir or _DEFAULT_OPERATIONS_DIR
@@ -77,6 +94,15 @@ def load_latest_reboot_auto_recovery_slo_scorecard(
     )
     if public_maintenance_runtime:
         apply_public_maintenance_runtime_readback(payload, public_maintenance_runtime)
+    metric_readback = prometheus_metric_readback
+    if metric_readback is None and _env_flag(_PROMETHEUS_READBACK_ENABLED_ENV):
+        metric_readback = _load_reboot_slo_prometheus_metric_readback(
+            prometheus_query_url=prometheus_query_url,
+        )
+    if metric_readback:
+        _annotate_prometheus_metric_readback(payload, metric_readback)
+        if not runtime_scorecard:
+            _apply_prometheus_metric_active_blockers(payload, metric_readback)
     _require_operation_boundaries(payload, str(path))
     return payload
 
@@ -179,6 +205,303 @@ def _load_latest_public_maintenance_runtime_readback(
     return _read_json_file(latest)
 
 
+def _load_reboot_slo_prometheus_metric_readback(
+    *,
+    prometheus_query_url: str | None = None,
+) -> dict[str, Any]:
+    """Query Prometheus for the reboot SLO gauges and summarize them.
+
+    The query URL is ``prometheus_query_url`` if given, else the
+    ``AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL`` environment variable, else the
+    built-in default. The readback is best effort: transport, HTTP protocol,
+    decoding and non-success-status failures are not raised; they produce a
+    result with ``readback_present`` False and the exception class name in
+    ``error_class``.
+    """
+    query_url = (
+        prometheus_query_url
+        or os.environ.get(_PROMETHEUS_QUERY_URL_ENV, "").strip()
+        or _DEFAULT_PROMETHEUS_QUERY_URL
+    )
+    timeout_seconds = _env_float(_PROMETHEUS_TIMEOUT_SECONDS_ENV, default=3.0)
+    if timeout_seconds <= 0:
+        # A zero or negative timeout can never succeed; use the default.
+        timeout_seconds = 3.0
+    try:
+        active_results = _query_prometheus_vector(
+            query_url,
+            _PROMETHEUS_RUNTIME_QUERIES["active_blocker"],
+            timeout_seconds=timeout_seconds,
+        )
+        primary_results = _query_prometheus_vector(
+            query_url,
+            _PROMETHEUS_RUNTIME_QUERIES["primary_blocker"],
+            timeout_seconds=timeout_seconds,
+        )
+        scalar_results = {
+            key: _query_prometheus_vector(
+                query_url,
+                query,
+                timeout_seconds=timeout_seconds,
+            )
+            for key, query in _PROMETHEUS_RUNTIME_QUERIES.items()
+            if key not in {"active_blocker", "primary_blocker"}
+        }
+    except (
+        OSError,
+        TimeoutError,
+        urllib.error.URLError,
+        # Not an OSError: raised for malformed or truncated HTTP responses.
+        http.client.HTTPException,
+        json.JSONDecodeError,
+        ValueError,
+    ) as exc:
+        return {
+            "schema_version": "reboot_slo_prometheus_metric_readback_v1",
+            "readback_present": False,
+            "source": "prometheus_query_api",
+            "query_url": query_url,
+            "error_class": exc.__class__.__name__,
+        }
+
+    active_blockers = _unique_strings(
+        [
+            str(_dict(result.get("metric")).get("blocker") or "")
+            for result in active_results
+            if str(_dict(result.get("metric")).get("blocker") or "")
+        ]
+    )
+    primary_blocker = next(
+        (
+            str(_dict(result.get("metric")).get("blocker") or "")
+            for result in primary_results
+            if str(_dict(result.get("metric")).get("blocker") or "")
+        ),
+        "",
+    )
+    if not primary_blocker:
+        primary_blocker = _reboot_sop_primary_blocker(active_blockers)
+    blocker_count = _prometheus_first_int(
+        scalar_results.get("blocker_count") or [],
+        default=len(active_blockers),
+    )
+    ready = _prometheus_first_int(scalar_results.get("ready") or [], default=0) == 1
+    last_run_timestamp = _prometheus_first_int(
+        scalar_results.get("last_run_timestamp") or [],
+        default=0,
+    )
+    return {
+        "schema_version": "reboot_slo_prometheus_metric_readback_v1",
+        "readback_present": bool(
+            active_results
+            or primary_results
+            or scalar_results.get("blocker_count")
+            or scalar_results.get("ready")
+        ),
+        "source": "prometheus_query_api",
+        "query_url": query_url,
+        "active_blockers": active_blockers,
+        "active_blocker_count": blocker_count,
+        "primary_blocker": primary_blocker,
+        "ready": ready,
+        "last_run_timestamp": last_run_timestamp,
+    }
+
+
+def _query_prometheus_vector(
+    query_url: str,
+    query: str,
+    *,
+    timeout_seconds: float,
+) -> list[dict[str, Any]]:
+    """Run one instant query and return the dict entries of ``data.result``.
+
+    Raises ``ValueError`` when the URL is not http(s) or the decoded payload
+    does not report ``status == "success"``; ``urlopen`` and JSON decoding
+    errors propagate to the caller.
+    """
+    # urlopen also serves file:// and ftp:// URLs; this URL is configurable,
+    # so only allow the schemes a Prometheus HTTP API can actually use.
+    if urllib.parse.urlsplit(query_url).scheme not in {"http", "https"}:
+        raise ValueError("prometheus_query_url_scheme_not_http")
+    # Keep any query string already present on the configured URL.
+    separator = "&" if "?" in query_url else "?"
+    url = f"{query_url}{separator}{urllib.parse.urlencode({'query': query})}"
+    request = urllib.request.Request(url, method="GET")
+    with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
+        payload = json.loads(response.read(1_000_000).decode("utf-8", errors="replace"))
+    if not isinstance(payload, dict) or payload.get("status") != "success":
+        raise ValueError("prometheus_status_not_success")
+    return _list_of_dicts(_dict(payload.get("data")).get("result"))
+
+
+def _prometheus_first_int(results: list[dict[str, Any]], *, default: int) -> int:
+    """Return the first sample's value as an int, or ``default`` if unusable.
+
+    Prometheus encodes sample values as strings and may send ``"NaN"`` or
+    ``"+Inf"``; neither converts to ``int``, so both fall back to ``default``.
+    """
+    if not results:
+        return default
+    value_pair = _list(_dict(results[0]).get("value"))
+    if len(value_pair) < 2:
+        return default
+    try:
+        return int(float(value_pair[1]))
+    except (TypeError, ValueError, OverflowError):
+        # ValueError covers NaN and non-numeric text, OverflowError covers Inf.
+        return default
+
+
+def _annotate_prometheus_metric_readback(
+    payload: dict[str, Any],
+    metric_readback: dict[str, Any],
+) -> None:
+    """Record the Prometheus readback on ``payload`` without changing blockers.
+
+    Stores the raw readback at the top level and copies its presence flag,
+    active-blocker count and last-run timestamp into the ``readback`` and
+    ``rollups`` sections (``readback`` also receives the metric source).
+    """
+    present = metric_readback.get("readback_present") is True
+    payload["runtime_metric_readback"] = metric_readback
+    payload["runtime_metric_readback_present"] = present
+    payload["runtime_metric_source"] = str(metric_readback.get("source") or "")
+    payload["runtime_metric_active_blocker_count"] = _int(
+        metric_readback.get("active_blocker_count")
+    )
+    payload["runtime_metric_last_run_timestamp"] = _int(
+        metric_readback.get("last_run_timestamp")
+    )
+
+    readback = _dict(payload.setdefault("readback", {}))
+    readback["runtime_metric_readback_present"] = present
+    readback["runtime_metric_source"] = payload["runtime_metric_source"]
+    readback["runtime_metric_active_blocker_count"] = payload[
+        "runtime_metric_active_blocker_count"
+    ]
+    readback["runtime_metric_last_run_timestamp"] = payload[
+        "runtime_metric_last_run_timestamp"
+    ]
+
+    rollups = _dict(payload.setdefault("rollups", {}))
+    rollups["runtime_metric_readback_present"] = present
+    rollups["runtime_metric_active_blocker_count"] = payload[
+        "runtime_metric_active_blocker_count"
+    ]
+    rollups["runtime_metric_last_run_timestamp"] = payload[
+        "runtime_metric_last_run_timestamp"
+    ]
+
+
+def _apply_prometheus_metric_active_blockers(
+    payload: dict[str, Any],
+    metric_readback: dict[str, Any],
+) -> None:
+    """Replace the payload's active-blocker state with the Prometheus view.
+
+    Leaves ``payload`` untouched when the readback is absent, or when it
+    reports a non-zero blocker count without any blocker labels: the two
+    series disagree, so there is nothing trustworthy to overlay.
+    """
+    if metric_readback.get("readback_present") is not True:
+        return
+    active_blockers = _strings(metric_readback.get("active_blockers"))
+    if not active_blockers and _int(metric_readback.get("active_blocker_count")):
+        return
+
+    can_claim_slo = metric_readback.get("ready") is True and not active_blockers
+    primary_blocker = str(
+        metric_readback.get("primary_blocker")
+        or _reboot_sop_primary_blocker(active_blockers)
+    )
+    current_phase = _reboot_sop_current_phase(active_blockers, can_claim_slo)
+    eta_or_wait_reason = _reboot_sop_eta_or_wait_reason(
+        scorecard=payload,
+        active_blockers=active_blockers,
+        current_phase=current_phase,
+        primary_blocker=primary_blocker,
+    )
+    action_matrix = _build_active_blocker_action_matrix(
+        active_blockers=active_blockers,
+        primary_blocker=primary_blocker,
+    )
+    primary_action = _dict(action_matrix.get("primary_blocker_action"))
+    next_safe_action = str(
+        primary_action.get("next_safe_action")
+        or _dict(payload.get("reboot_sop_progress")).get("next_safe_action")
+        or payload.get("next_safe_action")
+        or ""
+    )
+
+    payload["active_blockers"] = active_blockers
+    payload["active_blocker_count"] = len(active_blockers)
+    payload["primary_blocker"] = primary_blocker
+    payload["current_phase"] = current_phase
+    payload["eta_or_wait_reason"] = eta_or_wait_reason
+    payload["next_safe_action"] = next_safe_action
+    payload["can_claim_all_services_recovered_within_target"] = can_claim_slo
+    payload["blocked_by_fresh_reboot_window_only"] = active_blockers == [
+        "host_boot_observation_older_than_target_window"
+    ]
+    payload["active_blocker_action_matrix"] = action_matrix
+
+    progress = _dict(payload.setdefault("reboot_sop_progress", {}))
+    progress.update(
+        {
+            "current_phase": current_phase,
+            "eta_or_wait_reason": eta_or_wait_reason,
+            "primary_blocker": primary_blocker,
+            "active_blockers": active_blockers,
+            "active_blocker_count": len(active_blockers),
+            "next_safe_action": next_safe_action,
+        }
+    )
+
+    readback = _dict(payload.setdefault("readback", {}))
+    readback.update(
+        {
+            "current_phase": current_phase,
+            "eta_or_wait_reason": eta_or_wait_reason,
+            "primary_blocker": primary_blocker,
+            "primary_blocker_owner_lane": primary_action.get("owner_lane", ""),
+            "primary_blocker_action_category": primary_action.get("category", ""),
+            "active_blocker_count": len(active_blockers),
+            "active_blocker_action_count": action_matrix["item_count"],
+            "telegram_active_blocker_alert_required_count": action_matrix[
+                "telegram_alert_required_count"
+            ],
+            "next_safe_action": next_safe_action,
+            "blocked_by_fresh_reboot_window_only": payload[
+                "blocked_by_fresh_reboot_window_only"
+            ],
+        }
+    )
+
+    rollups = _dict(payload.setdefault("rollups", {}))
+    rollups.update(
+        {
+            "active_blocker_count": len(active_blockers),
+            "active_blocker_action_count": action_matrix["item_count"],
+            "active_blocker_action_category_counts": action_matrix[
+                "category_counts"
+            ],
+            "active_blocker_action_owner_lane_counts": action_matrix[
+                "owner_lane_counts"
+            ],
+            "primary_blocker_owner_lane": primary_action.get("owner_lane", ""),
+            "primary_blocker_action_category": primary_action.get("category", ""),
+            "telegram_active_blocker_alert_required_count": action_matrix[
+                "telegram_alert_required_count"
+            ],
+            "can_claim_all_services_recovered_within_target": can_claim_slo,
+            "blocked_by_fresh_reboot_window_only": payload[
+                "blocked_by_fresh_reboot_window_only"
+            ],
+        }
+    )
+
+
 def _path_from_env(name: str) -> Path | None:
     value = os.environ.get(name, "").strip()
     return Path(value) if value else None
@@ -1471,6 +1794,24 @@ def _int(value: Any) -> int:
         return 0
 
 
+def _env_flag(name: str) -> bool:
+    """Return True when env var ``name`` holds 1, true, yes or on."""
+    return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _env_float(name: str, *, default: float) -> float:
+    """Read env var ``name`` as a finite float, falling back to ``default``.
+
+    Unset, blank, unparsable, NaN and infinite values all yield ``default``.
+    """
+    try:
+        value = float(os.environ.get(name, "").strip() or default)
+    except ValueError:
+        return default
+    # Both comparisons are False for NaN, so one test rejects NaN and +/-Inf.
+    return value if float("-inf") < value < float("inf") else default
+
+
 def _percent(value: Any) -> int:
     return max(0, min(100, round(float(value or 0))))
 
@@ -1487,6 +1828,11 @@ def _list_of_dicts(value: Any) -> list[dict[str, Any]]:
     return [item for item in value if isinstance(item, dict)]
 
 
+def _list(value: Any) -> list[Any]:
+    """Return ``value`` when it is a list, otherwise an empty list."""
+    return value if isinstance(value, list) else []
+
+
 def _unique_strings(values: list[str]) -> list[str]:
     seen: set[str] = set()
     unique: list[str] = []
diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py
index 0c497a434..9a2c3c002 100644
--- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py
+++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py
@@ -79,6 +79,28 @@ _SOURCE_SCORECARD = (
     / "awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json"
 )
 
+PROMETHEUS_RUNTIME_BLOCKERS = [
+    "all_required_hosts_not_in_10_minute_reboot_window",
+    "conversation_event_hot_path_index_migration_source_missing",
+    "host_boot_observation_older_than_target_window",
+    "host_unreachable_after_reboot",
+    "host_uptime_unknown",
+    "reboot_event_required_host_unreachable",
+    "windows99_vmware_autostart_readback_missing",
+]
+
+PROMETHEUS_RUNTIME_READBACK = {
+    "schema_version": "reboot_slo_prometheus_metric_readback_v1",
+    "readback_present": True,
+    "source": "prometheus_query_api",
+    "query_url": "http://192.168.0.110:9090/api/v1/query",
+    "active_blockers": PROMETHEUS_RUNTIME_BLOCKERS,
+    "active_blocker_count": 7,
+    "primary_blocker": "reboot_event_required_host_unreachable",
+    "ready": False,
+    "last_run_timestamp": 1783010479,
+}
+
 
 def test_reboot_auto_recovery_slo_scorecard_loader_exposes_stockplatform_gate():
     payload = load_latest_reboot_auto_recovery_slo_scorecard()
@@ -226,6 +248,34 @@ def test_reboot_auto_recovery_slo_scorecard_uses_latest_runtime_scorecard_from_d
     )
 
 
+def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics():
+    payload = load_latest_reboot_auto_recovery_slo_scorecard(
+        prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK
+    )
+
+    assert payload["runtime_scorecard_readback_present"] is False
+    assert payload["runtime_metric_readback_present"] is True
+    assert payload["runtime_metric_active_blocker_count"] == 7
+    assert payload["runtime_metric_last_run_timestamp"] == 1783010479
+    assert payload["active_blockers"] == PROMETHEUS_RUNTIME_BLOCKERS
+    assert payload["active_blocker_count"] == 7
+    assert payload["readiness_percent"] == 47
+    assert payload["primary_blocker"] == "reboot_event_required_host_unreachable"
+    assert payload["next_safe_action"] == (
+        "rerun_reboot_event_detector_and_host_probe_verify_only_no_reboot"
+    )
+    assert "backup_core_green_not_1" not in payload["active_blockers"]
+    assert "service_green_not_1" not in payload["active_blockers"]
+    assert payload["active_blocker_action_matrix"]["item_count"] == 7
+    assert payload["readback"]["active_blocker_count"] == 7
+    assert payload["readback"]["runtime_metric_readback_present"] is True
+    assert payload["rollups"]["active_blocker_count"] == 7
+    assert payload["rollups"]["runtime_metric_readback_present"] is True
+    assert payload["rollups"]["primary_blocker_owner_lane"] == (
+        "reboot_event_detector_and_host_probe"
+    )
+
+
 def test_reboot_auto_recovery_slo_scorecard_endpoint_returns_readback(monkeypatch):
     monkeypatch.setattr(
         agents,
diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml
index 178b5f1ce..ab1119799 100644
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -176,6 +176,10 @@ spec:
               value: "24"
             - name: AWOOOP_ANSIBLE_CANDIDATE_BACKFILL_STARTUP_SLEEP_SECONDS
               value: "60"
+            - name: AWOOOI_REBOOT_SLO_PROMETHEUS_READBACK_ENABLED
+              value: "true"
+            - name: AWOOOI_REBOOT_SLO_PROMETHEUS_QUERY_URL
+              value: "http://192.168.0.110:9090/api/v1/query"
           # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
           volumeMounts:
             - name: repair-ssh-key