"""Smoke checks for the four-agent AI automation control plane.

The checks are read-only and intentionally avoid outbound network calls. They
are meant for a fast dashboard/API sanity check, not for deep production probes.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import json
|
||
import threading
|
||
from datetime import datetime, timedelta, timezone
|
||
from html import escape
|
||
from typing import Any, Dict, List
|
||
|
||
from sqlalchemy import text
|
||
|
||
from config import SYSTEM_VERSION
|
||
from database.manager import get_session
|
||
|
||
|
||
# Severity ordering; a higher rank wins when picking the overall smoke status.
STATUS_RANK = {"ok": 0, "warning": 1, "critical": 2}

# JSONL file that stores compact smoke results. The environment variable
# overrides the default location under the parent directory's "data" folder.
_HISTORY_PATH = os.environ.get(
    "MOMO_AI_AUTOMATION_SMOKE_HISTORY",
    os.path.join(
        os.path.dirname(os.path.dirname(__file__)),
        "data",
        "ai_automation_smoke_history.jsonl",
    ),
)

# Maximum number of history lines kept on disk when appending.
_HISTORY_LIMIT = int(os.environ.get("MOMO_AI_AUTOMATION_SMOKE_HISTORY_LIMIT", "200"))

# Serialises reads, rewrites and removal of the history file within this process.
_HISTORY_LOCK = threading.Lock()
|
||
|
||
|
||
def _check(name: str, status: str, summary: str, details: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Build the uniform result dict shared by every smoke check.

    Args:
        name: Display name of the check.
        status: Status string; the checks in this module use "ok",
            "warning" or "critical".
        summary: One-line human-readable outcome.
        details: Optional extra data; an omitted or falsy value becomes ``{}``.

    Returns:
        A dict with the keys ``name``, ``status``, ``summary`` and ``details``.
    """
    payload: Dict[str, Any] = dict(name=name, status=status, summary=summary)
    payload["details"] = details if details else {}
    return payload
|
||
|
||
|
||
def _count_jsonl_lines(path: str) -> int:
    """Return the number of non-blank lines in ``path``.

    A missing file counts as zero lines; other I/O errors propagate.
    """
    total = 0
    try:
        with open(path, "r", encoding="utf-8") as handle:
            for raw in handle:
                if raw.strip():
                    total += 1
    except FileNotFoundError:
        return 0
    return total
|
||
|
||
|
||
def _compact_history_record(result: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a full smoke result to the slim record stored in history.

    Keeps the timestamp, overall status, summary counts and, for each check,
    only its name/status/summary; per-check ``details`` are dropped.
    """
    slim_checks = []
    for entry in result.get("checks", []):
        slim_checks.append({field: entry.get(field) for field in ("name", "status", "summary")})
    return {
        "generated_at": result.get("generated_at"),
        # A result without a status is recorded as the worst case.
        "status": result.get("status", "critical"),
        "summary": result.get("summary", {}),
        "checks": slim_checks,
    }
|
||
|
||
|
||
def _read_history_lines() -> List[str]:
    """Return every raw line of the history file, read while holding the history lock.

    Raises:
        FileNotFoundError: If the history file does not exist.
    """
    with _HISTORY_LOCK, open(_HISTORY_PATH, "r", encoding="utf-8") as handle:
        return list(handle)
|
||
|
||
|
||
def _load_history(limit: int = 20) -> List[Dict[str, Any]]:
    """Load the most recent history records from the JSONL history file.

    Args:
        limit: How many trailing lines of the file to consider. A value of
            zero or less returns an empty list without touching the file.

    Returns:
        The decoded records in file order (oldest first). Lines that are not
        valid JSON, or that decode to something other than a JSON object, are
        skipped, so fewer than ``limit`` records may be returned. A missing
        history file yields an empty list.
    """
    if limit <= 0:
        return []
    try:
        lines = _read_history_lines()
    except FileNotFoundError:
        return []

    records: List[Dict[str, Any]] = []
    for line in lines[-limit:]:
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (``null``, ``[]``, ``42`` ...) would
        # break callers that use ``record.get``; treat it like a corrupt line.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
|
||
|
||
|
||
def _append_history(result: Dict[str, Any]) -> None:
    """Append a compact record of ``result`` to the history file, trimming old lines.

    The file is read, extended and rewritten while holding the history lock.
    When the configured history limit is positive, only the newest lines up to
    that limit are kept; a non-positive limit disables trimming.

    Args:
        result: Full smoke result; it is compacted before being stored.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    record = _compact_history_record(result)
    serialized = json.dumps(record, ensure_ascii=False, default=str) + "\n"

    # A bare file name has an empty dirname, and os.makedirs("") raises.
    parent = os.path.dirname(_HISTORY_PATH)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with _HISTORY_LOCK:
        try:
            with open(_HISTORY_PATH, "r", encoding="utf-8") as fh:
                existing = fh.readlines()
        except FileNotFoundError:
            existing = []

        # Keep one record per line even if the file lost its final newline.
        if existing and not existing[-1].endswith("\n"):
            existing[-1] += "\n"

        existing.append(serialized)
        # Guard the slice: a negative limit would otherwise drop the oldest
        # lines on every write instead of keeping the newest ones.
        if _HISTORY_LIMIT > 0 and len(existing) > _HISTORY_LIMIT:
            existing = existing[-_HISTORY_LIMIT:]

        with open(_HISTORY_PATH, "w", encoding="utf-8") as fh:
            fh.writelines(existing)
|
||
|
||
|
||
def _history_summary(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate history records into counts, the latest entry and daily buckets.

    Returns:
        A dict with ``counts`` (records per known status), ``recent`` (the
        records as given), ``latest`` (last record or ``None``) and ``daily``
        (per-day totals from ``_daily_summary``).
    """
    tally = dict.fromkeys(("ok", "warning", "critical"), 0)
    for entry in records:
        # A record without a status counts as critical; unknown statuses are ignored.
        state = entry.get("status", "critical")
        if state in tally:
            tally[state] += 1

    newest = records[-1] if records else None
    return {
        "counts": tally,
        "recent": records,
        "latest": newest,
        "daily": _daily_summary(records),
    }
|
||
|
||
|
||
def _daily_summary(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Bucket history records per calendar day and count statuses.

    The day is the first 10 characters of ``generated_at`` (the date part of
    an ISO timestamp); records without a timestamp go into ``"unknown"``.

    Args:
        records: History records; each is read via ``.get``.

    Returns:
        Up to 14 rows, sorted by day string with the last 14 kept, shaped as
        ``{"date", "ok", "warning", "critical", "total"}``. ``total`` counts
        every record of that day, including ones with an unrecognised status.
    """
    # Tuple membership uses equality, so an unhashable status cannot raise, and
    # the "total" key of the bucket can never be mistaken for a status.
    known_statuses = ("ok", "warning", "critical")
    by_day: Dict[str, Dict[str, int]] = {}
    for record in records:
        day = str(record.get("generated_at") or "unknown")[:10]
        bucket = by_day.setdefault(day, {"ok": 0, "warning": 0, "critical": 0, "total": 0})
        status = record.get("status", "critical")
        if status in known_statuses:
            bucket[status] += 1
        bucket["total"] += 1
    return [{"date": day, **by_day[day]} for day in sorted(by_day)[-14:]]
|
||
|
||
|
||
def export_smoke_history_jsonl() -> Dict[str, Any]:
    """Return the raw history file content for export.

    Returns:
        A dict with ``content`` (all lines joined verbatim), ``count`` (number
        of non-blank lines) and ``path`` (the history file path). A missing
        file yields empty content and a count of 0.
    """
    try:
        raw_lines = _read_history_lines()
    except FileNotFoundError:
        raw_lines = []

    non_blank = [entry for entry in raw_lines if entry.strip()]
    return {
        "content": "".join(raw_lines),
        "count": len(non_blank),
        "path": _HISTORY_PATH,
    }
|
||
|
||
|
||
def clear_smoke_history() -> Dict[str, Any]:
    """Delete the history file and report how many records were removed.

    Returns:
        ``{"cleared": <non-blank line count>, "path": <history path>}``. When
        the file does not exist, ``cleared`` is 0. Any other failure is
        reported through an extra ``error`` key (truncated to 300 characters)
        instead of being raised.
    """
    try:
        with _HISTORY_LOCK:
            # Count and remove under the same lock so a concurrent append
            # cannot slip in between and make the reported count stale.
            cleared = _count_jsonl_lines(_HISTORY_PATH)
            try:
                os.remove(_HISTORY_PATH)
            except FileNotFoundError:
                cleared = 0
        return {"cleared": cleared, "path": _HISTORY_PATH}
    except Exception as exc:
        return {"cleared": 0, "path": _HISTORY_PATH, "error": str(exc)[:300]}
|
||
|
||
|
||
def build_smoke_daily_summary_message(history_limit: int = 200) -> str:
    """Render an HTML-tagged text digest of the recent smoke history.

    The message lists the system version, the latest run's status and time,
    status counts over the loaded records, up to five daily rows and up to
    five warning/critical checks from the latest run. Values taken from the
    history are HTML-escaped.

    Args:
        history_limit: Number of trailing history records to load.

    Returns:
        The message as newline-joined text.
    """
    digest = _history_summary(_load_history(limit=history_limit))
    per_day = digest.get("daily", [])
    newest = digest.get("latest") or {}
    tally = digest.get("counts") or {"ok": 0, "warning": 0, "critical": 0}
    divider = "━━━━━━━━━━━━━━━━━━━━"

    out = [
        "🤖 <b>AI 自動化 Smoke 每日摘要</b>",
        divider,
        f"版本:<code>{escape(SYSTEM_VERSION)}</code>",
        f"最近狀態:<b>{escape(str(newest.get('status', 'no_data')).upper())}</b>",
        f"最近時間:{escape(str(newest.get('generated_at', '尚無紀錄')))}",
        "",
        "📊 <b>近期統計</b>",
        f"✅ OK:{tally.get('ok', 0)}",
        f"⚠️ Warning:{tally.get('warning', 0)}",
        f"🚨 Critical:{tally.get('critical', 0)}",
    ]

    out.append("")
    if not per_day:
        out.append("🗓️ 尚無 smoke history;請先開啟 /ai_automation_smoke 執行快檢。")
    else:
        out.append("🗓️ <b>最近每日摘要</b>")
        # Only the five most recent days are shown.
        for day_row in per_day[-5:]:
            day_label = escape(str(day_row.get("date")))
            out.append(
                f"{day_label}|"
                f"OK {day_row.get('ok', 0)} / W {day_row.get('warning', 0)} / C {day_row.get('critical', 0)}"
            )

    flagged = [
        entry for entry in newest.get("checks", [])
        if entry.get("status") in {"warning", "critical"}
    ]
    if flagged:
        out.extend(["", "🚩 <b>最近異常檢查</b>"])
        for entry in flagged[:5]:
            level = escape(str(entry.get("status", "")).upper())
            title = escape(str(entry.get("name", "unknown")))
            note = escape(str(entry.get("summary", "")))
            out.append(f"{level}|{title}:{note}")

    out.extend([divider, "入口:/ai_automation_smoke"])
    return "\n".join(out)
|
||
|
||
|
||
def send_smoke_daily_summary(chat_ids: List[Any] | None = None) -> Dict[str, Any]:
    """Build the daily smoke digest and send it through the Telegram helper.

    Args:
        chat_ids: Passed through unchanged to ``send_telegram_with_result``.

    Returns:
        A dict with ``status`` ("sent" when the helper result has a truthy
        ``ok``, otherwise "failed"), ``telegram`` (the helper's result) and
        ``message_preview`` (first 500 characters of the message).
    """
    # Imported lazily, as in the rest of this module's service lookups.
    from services.telegram_templates import send_telegram_with_result

    body = build_smoke_daily_summary_message()
    delivery = send_telegram_with_result(body, chat_ids=chat_ids)
    outcome = "sent" if delivery.get("ok") else "failed"
    return {
        "status": outcome,
        "telegram": delivery,
        "message_preview": body[:500],
    }
|
||
|
||
|
||
def _event_router_check() -> Dict[str, Any]:
    """Check that the event router is importable and whether its queue file has entries.

    Returns "warning" when the router's queue file holds non-blank lines,
    "ok" when it is empty or missing, and "critical" when anything in the
    check raises (including a failed import).
    """
    check_name = "EventRouter 通知鏈"
    try:
        from services import event_router
        from services.ai_automation_metrics import snapshot

        queued = _count_jsonl_lines(event_router._QUEUE_PATH)
        counters = snapshot().get("counters", {})

        # Counter keys are unpacked as (metric name, labels) pairs.
        dispatched = 0
        for (metric, _labels), value in counters.items():
            if metric == "event_router_dispatch_total":
                dispatched += value

        if queued:
            status, summary = "warning", "EventRouter 可用,但有待回放通知"
        else:
            status, summary = "ok", "EventRouter 可用,通知 queue 乾淨"

        details = {
            "dispatch_sync": callable(getattr(event_router, "dispatch_sync", None)),
            "notify_failure": callable(getattr(event_router, "notify_failure", None)),
            "queued_deliveries": queued,
            "dispatch_metric_total": dispatched,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"EventRouter smoke 失敗:{exc}")
|
||
|
||
|
||
def _row_mapping(row: Any) -> Dict[str, Any]:
    """Convert a database result row into a plain dict.

    A dict is returned as-is, an object exposing ``_mapping`` is copied into
    a new dict, and anything else (including ``None``) yields ``{}``.
    """
    if isinstance(row, dict):
        return row
    # getattr on None (or any object without the attribute) falls back to None.
    mapping = getattr(row, "_mapping", None)
    return {} if mapping is None else dict(mapping)
|
||
|
||
|
||
def _gemini_egress_check(window_hours: int = 24) -> Dict[str, Any]:
    """Read-only runtime sentinel for unexpected Gemini spend.

    Queries the ``ai_calls`` table for rows whose provider is "gemini" within
    the last ``window_hours`` hours and reports call count, tokens and cost.

    Status rules:
        * no calls in the window -> "ok"
        * calls while the Gemini guard reports hard-disabled -> "critical"
        * calls otherwise -> "warning"
        * any exception while reading -> "warning"

    Args:
        window_hours: Size of the look-back window in hours.

    Returns:
        A check dict built by ``_check``. The session is always closed.
    """
    check_name = "Gemini 出站費用 sentinel"
    session = None
    try:
        from services.gemini_guard import gemini_disabled_message, is_gemini_hard_disabled

        session = get_session()
        since_at = datetime.now(timezone.utc) - timedelta(hours=int(window_hours))
        params = {"since_at": since_at}

        totals_sql = text("""
            SELECT
                COUNT(*) AS calls,
                COALESCE(SUM(COALESCE(input_tokens, 0) + COALESCE(output_tokens, 0)), 0) AS tokens,
                COALESCE(SUM(COALESCE(cost_usd, 0)), 0) AS cost_usd,
                MAX(called_at) AS last_called
            FROM ai_calls
            WHERE lower(provider) = 'gemini'
              AND called_at >= :since_at
        """)
        summary = _row_mapping(session.execute(totals_sql, params).fetchone())
        calls = int(summary.get("calls") or 0)
        tokens = int(summary.get("tokens") or 0)
        cost_usd = float(summary.get("cost_usd") or 0.0)
        hard_disabled = is_gemini_hard_disabled()

        # The per-caller breakdown is only queried when there was any traffic.
        top_rows = []
        if calls:
            breakdown_sql = text("""
                SELECT
                    caller,
                    model,
                    COUNT(*) AS calls,
                    COALESCE(SUM(COALESCE(input_tokens, 0) + COALESCE(output_tokens, 0)), 0) AS tokens,
                    COALESCE(SUM(COALESCE(cost_usd, 0)), 0) AS cost_usd,
                    MAX(called_at) AS last_called
                FROM ai_calls
                WHERE lower(provider) = 'gemini'
                  AND called_at >= :since_at
                GROUP BY caller, model
                ORDER BY calls DESC, last_called DESC
                LIMIT 5
            """)
            for row in session.execute(breakdown_sql, params).fetchall():
                top_rows.append(_row_mapping(row))

        details = {
            "window_hours": int(window_hours),
            "since_at": since_at.isoformat(timespec="seconds"),
            "hard_disabled": hard_disabled,
            "calls": calls,
            "tokens": tokens,
            "cost_usd": round(cost_usd, 6),
            "last_called": str(summary.get("last_called") or ""),
            "top_callers": top_rows,
            "guard_reason": gemini_disabled_message("ai_smoke_gemini_egress"),
        }

        if calls == 0:
            status = "ok"
            message = f"最近 {window_hours}h Gemini 出站 0 次,費用 $0"
        elif hard_disabled:
            status = "critical"
            message = f"Hard-disabled 狀態仍偵測到 Gemini {calls} 次 / ${cost_usd:.6f}"
        else:
            status = "warning"
            message = f"Gemini emergency fallback 近 {window_hours}h 使用 {calls} 次 / ${cost_usd:.6f}"
        return _check(check_name, status, message, details)
    except Exception as exc:
        return _check(
            check_name,
            "warning",
            f"Gemini 出站紀錄無法讀取:{exc}",
        )
    finally:
        if session is not None:
            session.close()
|
||
|
||
|
||
def _autoheal_check() -> Dict[str, Any]:
    """Verify that the auto-heal service lists the database containers as protected.

    Returns "critical" when "momo-db" or "momo-postgres" is missing from the
    service's ``_PROTECTED_CONTAINERS`` (or when the check itself raises),
    otherwise "ok".
    """
    check_name = "AutoHeal 安全邊界"
    try:
        import services.auto_heal_service as autoheal

        protected = set(getattr(autoheal, "_PROTECTED_CONTAINERS", set()))
        missing = sorted({"momo-db", "momo-postgres"} - protected)
        allowed_actions = sorted(getattr(autoheal, "_ALLOWED_ACTION_TYPES", set()))

        if missing:
            status, summary = "critical", "AutoHeal protected resource 缺漏"
        else:
            status, summary = "ok", "AutoHeal 保護資料庫容器,安全邊界存在"

        details = {
            "protected_containers": sorted(protected),
            "missing_required_protection": missing,
            "allowed_actions": allowed_actions,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"AutoHeal smoke 失敗:{exc}")
|
||
|
||
|
||
def _nemotron_check() -> Dict[str, Any]:
    """Inspect the NemoTron dispatcher module for its fallback, API key and quota state.

    Status rules, in order:
        * dispatcher class or its ``_hermes_rule_fallback`` missing -> "critical"
        * module-level ``NIM_API_KEY`` empty -> "warning"
        * call count at or above the daily limit -> "warning"
        * otherwise -> "ok"
    Any exception (including a failed import) yields "critical".
    """
    check_name = "NemoTron fallback"
    try:
        # NOTE(review): the module name is spelled "nemoton" here while the
        # class names use "Nemotron" - confirm this matches the real file name.
        import services.nemoton_dispatcher_service as nemotron

        dispatcher_cls = getattr(nemotron, "NemotronDispatcherService", None)
        if not dispatcher_cls:
            dispatcher_cls = getattr(nemotron, "NemotronDispatcher", None)

        fallback_ready = bool(dispatcher_cls and hasattr(dispatcher_cls, "_hermes_rule_fallback"))
        api_key_configured = bool(getattr(nemotron, "NIM_API_KEY", ""))
        call_count = getattr(nemotron, "_nim_call_count", {}).get("count", 0)
        daily_limit = getattr(nemotron, "NIM_DAILY_LIMIT", 80)

        if not fallback_ready:
            status, summary = "critical", "NemoTron Hermes fallback 缺失"
        elif not api_key_configured:
            status, summary = "warning", "NemoTron API key 未設定,目前會走 Hermes fallback"
        elif call_count >= daily_limit:
            status, summary = "warning", "NemoTron 配額已達上限,會走 Hermes fallback"
        else:
            status, summary = "ok", "NemoTron 與 Hermes fallback 機制可用"

        details = {
            "fallback_ready": fallback_ready,
            "dispatcher_class": getattr(dispatcher_cls, "__name__", None),
            "api_key_configured": api_key_configured,
            "call_count": call_count,
            "daily_limit": daily_limit,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"NemoTron smoke 失敗:{exc}")
|
||
|
||
|
||
def _embedding_queue_check() -> Dict[str, Any]:
    """Report the backlog of the ``embedding_retry_queue`` table by status.

    Returns "warning" when more than 1000 rows are pending or more than 200
    are processing, and also when the table cannot be read; otherwise "ok".
    The session is always closed.
    """
    check_name = "OpenClaw embedding queue"
    session = None
    try:
        session = get_session()
        grouped = session.execute(
            text("SELECT status, COUNT(*) AS count FROM embedding_retry_queue GROUP BY status")
        ).fetchall()

        counts = {}
        for row in grouped:
            fields = row._mapping
            counts[str(fields["status"])] = int(fields["count"])

        pending = counts.get("pending", 0)
        processing = counts.get("processing", 0)
        backlog_high = pending > 1000 or processing > 200
        if backlog_high:
            status, summary = "warning", "OpenClaw embedding queue backlog 偏高"
        else:
            status, summary = "ok", "OpenClaw embedding queue 可讀取且 backlog 正常"

        return _check(
            check_name,
            status,
            summary,
            {"counts": counts, "pending": pending, "processing": processing},
        )
    except Exception as exc:
        return _check(
            check_name,
            "warning",
            f"Embedding queue 無法讀取,可能是 DB 離線或 migration 未套用:{exc}",
        )
    finally:
        if session is not None:
            session.close()
|
||
|
||
|
||
def _elephant_hitl_check() -> Dict[str, Any]:
    """Check that the ElephantAlpha engine exposes its escalation and timeout hooks.

    Status rules, in order:
        * ``_escalate_to_human`` or ``_run_with_timeout`` missing -> "critical"
        * neither OPENROUTER_API_KEY nor NVIDIA_API_KEY set -> "warning"
        * otherwise -> "ok"
    Any exception (including a failed import) yields "critical".
    """
    check_name = "ElephantAlpha AI 例外決策"
    try:
        from services.elephant_alpha_autonomous_engine import ElephantAlphaAutonomousEngine

        has_hitl = hasattr(ElephantAlphaAutonomousEngine, "_escalate_to_human")
        has_timeout_guard = hasattr(ElephantAlphaAutonomousEngine, "_run_with_timeout")
        api_key_configured = bool(os.getenv("OPENROUTER_API_KEY") or os.getenv("NVIDIA_API_KEY"))

        if not (has_hitl and has_timeout_guard):
            status, summary = "critical", "ElephantAlpha AI 例外決策或 timeout guard 缺失"
        elif not api_key_configured:
            status, summary = "warning", "ElephantAlpha AI 例外決策程式可用,但 API key 未設定"
        else:
            status, summary = "ok", "ElephantAlpha AI 例外決策與 timeout guard 可用"

        details = {
            "hitl_method": has_hitl,
            "timeout_guard": has_timeout_guard,
            "api_key_configured": api_key_configured,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"ElephantAlpha smoke 失敗:{exc}")
|
||
|
||
|
||
def collect_ai_automation_smoke(*, record_history: bool = True, history_limit: int = 20) -> Dict[str, Any]:
    """Run every smoke check and return the combined result with recent history.

    Args:
        record_history: When true, a compact record of this run is appended
            to the history file before the history is loaded.
        history_limit: Number of trailing history records to include.

    Returns:
        A dict with ``status`` (the worst status across checks), ``version``,
        ``generated_at`` (local time, ISO format to the second), ``checks``,
        ``summary`` (per-status counts plus total) and ``history``. If
        writing or reading the history fails, ``history_error`` carries the
        first failure's message (truncated to 300 characters) and the smoke
        result is still returned.
    """
    checks: List[Dict[str, Any]] = [
        _event_router_check(),
        _gemini_egress_check(),
        _autoheal_check(),
        _nemotron_check(),
        _embedding_queue_check(),
        _elephant_hitl_check(),
    ]
    # Unknown statuses rank as critical so they can never be hidden by an "ok".
    worst = max(checks, key=lambda item: STATUS_RANK.get(item["status"], 2))["status"]
    result: Dict[str, Any] = {
        "status": worst,
        "version": SYSTEM_VERSION,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "checks": checks,
        "summary": {
            "ok": sum(1 for item in checks if item["status"] == "ok"),
            "warning": sum(1 for item in checks if item["status"] == "warning"),
            "critical": sum(1 for item in checks if item["status"] == "critical"),
            "total": len(checks),
        },
    }
    if record_history:
        try:
            _append_history(result)
        except Exception as exc:
            result["history_error"] = str(exc)[:300]

    # Reading history is best-effort, like writing it: an unreadable or
    # corrupt history file must not take down the smoke result itself.
    try:
        result["history"] = _history_summary(_load_history(limit=history_limit))
    except Exception as exc:
        result.setdefault("history_error", str(exc)[:300])
        result["history"] = _history_summary([])
    return result
|