Files
ewoooc/services/ai_automation_smoke_service.py
ogt e15d543aa2
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
清除 AI 自動化 P2 人工語意債
2026-07-01 14:01:21 +08:00

510 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Smoke checks for the four-agent AI automation control plane.
The checks are read-only and intentionally avoid outbound network calls. They
are meant for a fast dashboard/API sanity check, not for deep production probes.
"""
from __future__ import annotations
import os
import json
import threading
from datetime import datetime, timedelta, timezone
from html import escape
from typing import Any, Dict, List
from sqlalchemy import text
from config import SYSTEM_VERSION
from database.manager import get_session
# Severity order of the check statuses; collect_ai_automation_smoke reports the
# highest-ranked status among all checks as the overall status.
STATUS_RANK = {"ok": 0, "warning": 1, "critical": 2}
# Location of the JSONL history file. The MOMO_AI_AUTOMATION_SMOKE_HISTORY
# environment variable overrides the default, which is
# data/ai_automation_smoke_history.jsonl one level above this file's directory.
_HISTORY_PATH = os.getenv(
"MOMO_AI_AUTOMATION_SMOKE_HISTORY",
os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "ai_automation_smoke_history.jsonl"),
)
# Maximum number of lines _append_history keeps in the history file
# (MOMO_AI_AUTOMATION_SMOKE_HISTORY_LIMIT, default 200). A non-integer value
# makes int() raise at import time.
_HISTORY_LIMIT = int(os.getenv("MOMO_AI_AUTOMATION_SMOKE_HISTORY_LIMIT", "200"))
# Acquired by _read_history_lines, _append_history and clear_smoke_history
# around their access to the history file.
_HISTORY_LOCK = threading.Lock()
def _check(name: str, status: str, summary: str, details: Dict[str, Any] | None = None) -> Dict[str, Any]:
return {
"name": name,
"status": status,
"summary": summary,
"details": details or {},
}
def _count_jsonl_lines(path: str) -> int:
    """Count the non-blank lines of a text file.

    Args:
        path: File to read as UTF-8 text.

    Returns:
        The number of lines that contain something other than whitespace, or
        0 when the file does not exist.
    """
    total = 0
    try:
        with open(path, "r", encoding="utf-8") as handle:
            for raw in handle:
                if raw.strip():
                    total += 1
    except FileNotFoundError:
        return 0
    return total
def _compact_history_record(result: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a full smoke result to the fields stored in the history file.

    Args:
        result: A smoke result dict; missing keys fall back to defaults.

    Returns:
        A dict with "generated_at", "status" (default "critical"), "summary"
        (default ``{}``) and "checks", where each check keeps only its
        "name", "status" and "summary" values.
    """
    kept_fields = ("name", "status", "summary")
    slim_checks = []
    for entry in result.get("checks", []):
        slim_checks.append({field: entry.get(field) for field in kept_fields})
    return {
        "generated_at": result.get("generated_at"),
        "status": result.get("status", "critical"),
        "summary": result.get("summary", {}),
        "checks": slim_checks,
    }
def _read_history_lines() -> List[str]:
    """Return every raw line of the history file, read while holding the history lock.

    Returns:
        The lines of ``_HISTORY_PATH`` (line endings included), decoded as UTF-8.

    Raises:
        FileNotFoundError: Propagated from ``open`` when the file does not exist.
    """
    with _HISTORY_LOCK, open(_HISTORY_PATH, "r", encoding="utf-8") as handle:
        raw_lines = list(handle)
    return raw_lines
def _load_history(limit: int = 20) -> List[Dict[str, Any]]:
    """Load the most recent records from the JSONL history file.

    Args:
        limit: Maximum number of trailing file lines to parse. Values <= 0
            return an empty list without touching the file.

    Returns:
        The parsed records in file order. Lines that are not valid JSON, and
        valid JSON values that are not objects, are skipped. An absent history
        file yields an empty list.
    """
    if limit <= 0:
        return []
    try:
        lines = _read_history_lines()
    except FileNotFoundError:
        return []
    records: List[Dict[str, Any]] = []
    for line in lines[-limit:]:
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        # The summary helpers call .get() on every record, so a stray JSON
        # scalar, list or null line must be dropped just like a corrupt line.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
def _append_history(result: Dict[str, Any]) -> None:
    """Append a compact record of ``result`` to the history file and trim old lines.

    The whole file is read, extended and rewritten while holding
    ``_HISTORY_LOCK``; when more than ``_HISTORY_LIMIT`` lines exist only the
    newest ``_HISTORY_LIMIT`` are kept.

    Args:
        result: Full smoke result; it is reduced with ``_compact_history_record``.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    record = _compact_history_record(result)
    # os.makedirs("") raises FileNotFoundError, so only create the parent when
    # the configured path has a directory component (e.g. not a bare filename).
    parent_dir = os.path.dirname(_HISTORY_PATH)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    serialized = json.dumps(record, ensure_ascii=False, default=str) + "\n"
    with _HISTORY_LOCK:
        try:
            with open(_HISTORY_PATH, "r", encoding="utf-8") as fh:
                existing = fh.readlines()
        except FileNotFoundError:
            existing = []
        # A final line without a newline would otherwise be fused with the new
        # record into one unparsable line.
        if existing and not existing[-1].endswith("\n"):
            existing[-1] += "\n"
        existing.append(serialized)
        # NOTE(review): with a limit of 0 the slice below is existing[-0:], i.e.
        # the whole list, so nothing is trimmed - confirm that is intended.
        if len(existing) > _HISTORY_LIMIT:
            existing = existing[-_HISTORY_LIMIT:]
        with open(_HISTORY_PATH, "w", encoding="utf-8") as fh:
            fh.writelines(existing)
def _history_summary(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate history records into the summary attached to a smoke result.

    Args:
        records: History records, oldest first.

    Returns:
        A dict with "counts" (records per ok/warning/critical status, where a
        missing status counts as "critical" and other statuses are ignored),
        "recent" (the input list itself), "latest" (last record or None) and
        "daily" (output of ``_daily_summary``).
    """
    tally = dict.fromkeys(("ok", "warning", "critical"), 0)
    for entry in records:
        state = entry.get("status", "critical")
        if state not in tally:
            continue
        tally[state] += 1
    newest = records[-1] if records else None
    return {
        "counts": tally,
        "recent": records,
        "latest": newest,
        "daily": _daily_summary(records),
    }
def _daily_summary(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group history records per day and count their statuses.

    The day is the first 10 characters of the stringified "generated_at"
    value; records without one fall into the "unknown" bucket.

    Args:
        records: History records in any order.

    Returns:
        At most the 14 last buckets in ascending key order, each shaped like
        ``{"date", "ok", "warning", "critical", "total"}``.
    """
    grouped: Dict[str, Dict[str, int]] = {}
    for entry in records:
        stamp = entry.get("generated_at") or "unknown"
        day_key = str(stamp)[:10]
        if day_key not in grouped:
            grouped[day_key] = {"ok": 0, "warning": 0, "critical": 0, "total": 0}
        tally = grouped[day_key]
        state = entry.get("status", "critical")
        if state in tally:
            tally[state] += 1
        tally["total"] += 1
    rows: List[Dict[str, Any]] = []
    for day_key in sorted(grouped)[-14:]:
        row: Dict[str, Any] = {"date": day_key}
        row.update(grouped[day_key])
        rows.append(row)
    return rows
def export_smoke_history_jsonl() -> Dict[str, Any]:
    """Return the raw history file content for export.

    Returns:
        A dict with "content" (all lines joined unchanged), "count" (number of
        non-blank lines) and "path" (the history file path). A missing file
        gives empty content and a count of 0.
    """
    try:
        raw_lines = _read_history_lines()
    except FileNotFoundError:
        raw_lines = []
    non_blank = [entry for entry in raw_lines if entry.strip()]
    return {
        "content": "".join(raw_lines),
        "count": len(non_blank),
        "path": _HISTORY_PATH,
    }
def clear_smoke_history() -> Dict[str, Any]:
    """Delete the history file and report how many records were removed.

    Returns:
        ``{"cleared": <non-blank lines removed>, "path": <history path>}``.
        A missing file reports 0. Any other failure is not raised; it is
        returned as an extra "error" key (truncated to 300 characters) with
        "cleared" set to 0.
    """
    try:
        # Count and delete inside one critical section so a concurrent
        # _append_history cannot change the file between the two steps.
        with _HISTORY_LOCK:
            cleared = _count_jsonl_lines(_HISTORY_PATH)
            try:
                os.remove(_HISTORY_PATH)
            except FileNotFoundError:
                cleared = 0
        return {"cleared": cleared, "path": _HISTORY_PATH}
    except Exception as exc:
        return {"cleared": 0, "path": _HISTORY_PATH, "error": str(exc)[:300]}
def build_smoke_daily_summary_message(history_limit: int = 200) -> str:
    """Render the recorded smoke history as an HTML-formatted text message.

    The message uses ``<b>``/``<code>`` markup and HTML-escapes every value
    taken from the history; ``send_smoke_daily_summary`` sends it via Telegram.

    Args:
        history_limit: Number of trailing history lines to load.

    Returns:
        The message, one section per line group, joined with newlines.
    """
    records = _load_history(limit=history_limit)
    summary = _history_summary(records)
    daily_rows = summary.get("daily", [])
    latest = summary.get("latest") or {}
    counts = summary.get("counts") or {"ok": 0, "warning": 0, "critical": 0}
    lines = [
        "🤖 <b>AI 自動化 Smoke 每日摘要</b>",
        "━━━━━━━━━━━━━━━━━━━━",
        f"版本:<code>{escape(SYSTEM_VERSION)}</code>",
        f"最近狀態:<b>{escape(str(latest.get('status', 'no_data')).upper())}</b>",
        f"最近時間:{escape(str(latest.get('generated_at', '尚無紀錄')))}",
        "",
        "📊 <b>近期統計</b>",
        # A separator between label and number keeps them from rendering fused
        # (previously e.g. "OK3").
        f"✅ OK:{counts.get('ok', 0)}",
        f"⚠️ Warning:{counts.get('warning', 0)}",
        f"🚨 Critical:{counts.get('critical', 0)}",
    ]
    if daily_rows:
        lines += ["", "🗓️ <b>最近每日摘要</b>"]
        for row in daily_rows[-5:]:
            # Date and counters are separated so the row is readable.
            lines.append(
                f"{escape(str(row.get('date')))}:"
                f"OK {row.get('ok', 0)} / W {row.get('warning', 0)} / C {row.get('critical', 0)}"
            )
    else:
        lines += ["", "🗓️ 尚無 smoke history,請先開啟 /ai_automation_smoke 執行快檢。"]
    # "checks" may be absent or null in a stored record; treat both as empty.
    problem_checks = [
        item for item in (latest.get("checks") or [])
        if item.get("status") in {"warning", "critical"}
    ]
    if problem_checks:
        lines += ["", "🚩 <b>最近異常檢查</b>"]
        for item in problem_checks[:5]:
            # Status, check name and summary are delimited instead of concatenated.
            lines.append(
                f"{escape(str(item.get('status', '')).upper())} "
                f"{escape(str(item.get('name', 'unknown')))}:"
                f"{escape(str(item.get('summary', '')))}"
            )
    lines += [
        "━━━━━━━━━━━━━━━━━━━━",
        "入口:/ai_automation_smoke",
    ]
    return "\n".join(lines)
def send_smoke_daily_summary(chat_ids: List[Any] | None = None) -> Dict[str, Any]:
    """Build the daily smoke summary and send it through the Telegram helper.

    Args:
        chat_ids: Passed through unchanged as ``chat_ids`` to
            ``send_telegram_with_result``.

    Returns:
        A dict with "status" ("sent" when the helper result has a truthy "ok"
        value, otherwise "failed"), "telegram" (the helper result) and
        "message_preview" (first 500 characters of the message).
    """
    # Imported lazily, inside the function, as in the rest of this module's checks.
    from services.telegram_templates import send_telegram_with_result

    message = build_smoke_daily_summary_message()
    delivery = send_telegram_with_result(message, chat_ids=chat_ids)
    if delivery.get("ok"):
        outcome = "sent"
    else:
        outcome = "failed"
    return {
        "status": outcome,
        "telegram": delivery,
        "message_preview": message[:500],
    }
def _event_router_check() -> Dict[str, Any]:
    """Smoke-check the event router module and its queued-delivery file.

    Returns:
        A check record: "warning" when the file at ``event_router._QUEUE_PATH``
        has non-blank lines, "ok" when it has none, and "critical" when the
        imports or any lookup raise.
    """
    check_name = "EventRouter 通知鏈"
    try:
        from services import event_router
        from services.ai_automation_metrics import snapshot

        queue_count = _count_jsonl_lines(event_router._QUEUE_PATH)
        counters = snapshot().get("counters", {})
        # Counter keys are unpacked as (metric name, labels) pairs.
        dispatch_total = 0
        for (metric, _labels), value in counters.items():
            if metric == "event_router_dispatch_total":
                dispatch_total += value
        if queue_count:
            status, summary = "warning", "EventRouter 可用,但有待回放通知"
        else:
            status, summary = "ok", "EventRouter 可用,通知 queue 乾淨"
        details = {
            "dispatch_sync": callable(getattr(event_router, "dispatch_sync", None)),
            "notify_failure": callable(getattr(event_router, "notify_failure", None)),
            "queued_deliveries": queue_count,
            "dispatch_metric_total": dispatch_total,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"EventRouter smoke 失敗:{exc}")
def _row_mapping(row: Any) -> Dict[str, Any]:
    """Convert a query result row into a plain dict.

    Args:
        row: ``None``, a dict, or an object exposing a ``_mapping`` attribute.

    Returns:
        The dict itself when ``row`` is a dict, a dict copy of ``row._mapping``
        when that attribute exists and is not None, otherwise an empty dict.
    """
    if isinstance(row, dict):
        return row
    # getattr on None also yields the default, so a missing row ends up as {}.
    mapping = getattr(row, "_mapping", None)
    return {} if mapping is None else dict(mapping)
def _gemini_egress_check(window_hours: int = 24) -> Dict[str, Any]:
    """Read-only sentinel that reports Gemini calls recorded in ``ai_calls``.

    Args:
        window_hours: Size of the look-back window in hours, counted back from
            the current UTC time.

    Returns:
        A check record: "ok" when no Gemini rows fall inside the window,
        "critical" when rows exist although ``is_gemini_hard_disabled()`` is
        true, "warning" when rows exist otherwise, and "warning" when the
        lookup itself raises.
    """
    check_name = "Gemini 出站費用 sentinel"
    # The SQL text is assembled line by line so the statement sent to the
    # database stays exactly the same.
    totals_sql = (
        "\n"
        "SELECT\n"
        "COUNT(*) AS calls,\n"
        "COALESCE(SUM(COALESCE(input_tokens, 0) + COALESCE(output_tokens, 0)), 0) AS tokens,\n"
        "COALESCE(SUM(COALESCE(cost_usd, 0)), 0) AS cost_usd,\n"
        "MAX(called_at) AS last_called\n"
        "FROM ai_calls\n"
        "WHERE lower(provider) = 'gemini'\n"
        "AND called_at >= :since_at\n"
    )
    callers_sql = (
        "\n"
        "SELECT\n"
        "caller,\n"
        "model,\n"
        "COUNT(*) AS calls,\n"
        "COALESCE(SUM(COALESCE(input_tokens, 0) + COALESCE(output_tokens, 0)), 0) AS tokens,\n"
        "COALESCE(SUM(COALESCE(cost_usd, 0)), 0) AS cost_usd,\n"
        "MAX(called_at) AS last_called\n"
        "FROM ai_calls\n"
        "WHERE lower(provider) = 'gemini'\n"
        "AND called_at >= :since_at\n"
        "GROUP BY caller, model\n"
        "ORDER BY calls DESC, last_called DESC\n"
        "LIMIT 5\n"
    )
    session = None
    try:
        from services.gemini_guard import gemini_disabled_message, is_gemini_hard_disabled

        session = get_session()
        since_at = datetime.now(timezone.utc) - timedelta(hours=int(window_hours))
        totals_row = session.execute(text(totals_sql), {"since_at": since_at}).fetchone()
        totals = _row_mapping(totals_row)
        calls = int(totals.get("calls") or 0)
        tokens = int(totals.get("tokens") or 0)
        cost_usd = float(totals.get("cost_usd") or 0.0)
        hard_disabled = is_gemini_hard_disabled()
        top_rows = []
        if calls:
            # The per-caller breakdown is only queried when there is something to show.
            breakdown = session.execute(text(callers_sql), {"since_at": since_at}).fetchall()
            top_rows = [_row_mapping(item) for item in breakdown]
        details = {
            "window_hours": int(window_hours),
            "since_at": since_at.isoformat(timespec="seconds"),
            "hard_disabled": hard_disabled,
            "calls": calls,
            "tokens": tokens,
            "cost_usd": round(cost_usd, 6),
            "last_called": str(totals.get("last_called") or ""),
            "top_callers": top_rows,
            "guard_reason": gemini_disabled_message("ai_smoke_gemini_egress"),
        }
        if calls == 0:
            status = "ok"
            headline = f"最近 {window_hours}h Gemini 出站 0 次,費用 $0"
        elif hard_disabled:
            status = "critical"
            headline = f"Hard-disabled 狀態仍偵測到 Gemini {calls} 次 / ${cost_usd:.6f}"
        else:
            status = "warning"
            headline = f"Gemini emergency fallback 近 {window_hours}h 使用 {calls} 次 / ${cost_usd:.6f}"
        return _check(check_name, status, headline, details)
    except Exception as exc:
        return _check(
            check_name,
            "warning",
            f"Gemini 出站紀錄無法讀取:{exc}",
        )
    finally:
        # Runs on every path once a session was obtained.
        if session is not None:
            session.close()
def _autoheal_check() -> Dict[str, Any]:
    """Verify that the auto-heal module lists the database containers as protected.

    Returns:
        A check record: "ok" when both "momo-db" and "momo-postgres" are in
        the module's ``_PROTECTED_CONTAINERS``, "critical" when either is
        missing or when the import or any lookup raises.
    """
    check_name = "AutoHeal 安全邊界"
    try:
        import services.auto_heal_service as autoheal

        protected = set(getattr(autoheal, "_PROTECTED_CONTAINERS", set()))
        missing = sorted({"momo-db", "momo-postgres"}.difference(protected))
        allowed_actions = sorted(getattr(autoheal, "_ALLOWED_ACTION_TYPES", set()))
        if missing:
            status, summary = "critical", "AutoHeal protected resource 缺漏"
        else:
            status, summary = "ok", "AutoHeal 保護資料庫容器,安全邊界存在"
        details = {
            "protected_containers": sorted(protected),
            "missing_required_protection": missing,
            "allowed_actions": allowed_actions,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"AutoHeal smoke 失敗:{exc}")
def _nemotron_check() -> Dict[str, Any]:
    """Inspect the NemoTron dispatcher module for fallback readiness and quota.

    Returns:
        A check record: "critical" when no dispatcher class with a
        ``_hermes_rule_fallback`` attribute is found (or the import raises),
        "warning" when ``NIM_API_KEY`` is falsy or the call count has reached
        the daily limit, otherwise "ok".
    """
    check_name = "NemoTron fallback"
    try:
        import services.nemoton_dispatcher_service as nemotron

        # Either class name is accepted; the first truthy one is used.
        dispatcher_cls = getattr(nemotron, "NemotronDispatcherService", None)
        if not dispatcher_cls:
            dispatcher_cls = getattr(nemotron, "NemotronDispatcher", None)
        fallback_ready = bool(dispatcher_cls and hasattr(dispatcher_cls, "_hermes_rule_fallback"))
        api_key_configured = bool(getattr(nemotron, "NIM_API_KEY", ""))
        call_count = getattr(nemotron, "_nim_call_count", {}).get("count", 0)
        daily_limit = getattr(nemotron, "NIM_DAILY_LIMIT", 80)
        if not fallback_ready:
            verdict = ("critical", "NemoTron Hermes fallback 缺失")
        elif not api_key_configured:
            verdict = ("warning", "NemoTron API key 未設定,目前會走 Hermes fallback")
        elif call_count >= daily_limit:
            verdict = ("warning", "NemoTron 配額已達上限,會走 Hermes fallback")
        else:
            verdict = ("ok", "NemoTron 與 Hermes fallback 機制可用")
        status, summary = verdict
        details = {
            "fallback_ready": fallback_ready,
            "dispatcher_class": getattr(dispatcher_cls, "__name__", None),
            "api_key_configured": api_key_configured,
            "call_count": call_count,
            "daily_limit": daily_limit,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"NemoTron smoke 失敗:{exc}")
def _embedding_queue_check() -> Dict[str, Any]:
    """Report the backlog of the ``embedding_retry_queue`` table by status.

    Returns:
        A check record: "warning" when more than 1000 rows are "pending" or
        more than 200 are "processing", "ok" otherwise, and "warning" when the
        query raises.
    """
    check_name = "OpenClaw embedding queue"
    session = None
    try:
        session = get_session()
        statement = text("SELECT status, COUNT(*) AS count FROM embedding_retry_queue GROUP BY status")
        rows = session.execute(statement).fetchall()
        counts: Dict[str, int] = {}
        for row in rows:
            fields = row._mapping
            status_key = str(fields["status"])
            counts[status_key] = int(fields["count"])
        pending = counts.get("pending", 0)
        processing = counts.get("processing", 0)
        if pending > 1000 or processing > 200:
            status, summary = "warning", "OpenClaw embedding queue backlog 偏高"
        else:
            status, summary = "ok", "OpenClaw embedding queue 可讀取且 backlog 正常"
        details = {"counts": counts, "pending": pending, "processing": processing}
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(
            check_name,
            "warning",
            f"Embedding queue 無法讀取,可能是 DB 離線或 migration 未套用:{exc}",
        )
    finally:
        # Runs on every path once a session was obtained.
        if session is not None:
            session.close()
def _elephant_hitl_check() -> Dict[str, Any]:
    """Check that the ElephantAlpha engine class exposes its guard methods.

    Returns:
        A check record: "critical" when ``_escalate_to_human`` or
        ``_run_with_timeout`` is missing on the engine class (or the import
        raises), "warning" when neither ``OPENROUTER_API_KEY`` nor
        ``NVIDIA_API_KEY`` is set in the environment, otherwise "ok".
    """
    check_name = "ElephantAlpha AI 例外決策"
    try:
        from services.elephant_alpha_autonomous_engine import ElephantAlphaAutonomousEngine

        engine_cls = ElephantAlphaAutonomousEngine
        has_hitl = hasattr(engine_cls, "_escalate_to_human")
        has_timeout_guard = hasattr(engine_cls, "_run_with_timeout")
        api_key_configured = bool(os.getenv("OPENROUTER_API_KEY") or os.getenv("NVIDIA_API_KEY"))
        if not (has_hitl and has_timeout_guard):
            status, summary = "critical", "ElephantAlpha AI 例外決策或 timeout guard 缺失"
        elif not api_key_configured:
            status, summary = "warning", "ElephantAlpha AI 例外決策程式可用,但 API key 未設定"
        else:
            status, summary = "ok", "ElephantAlpha AI 例外決策與 timeout guard 可用"
        details = {
            "hitl_method": has_hitl,
            "timeout_guard": has_timeout_guard,
            "api_key_configured": api_key_configured,
        }
        return _check(check_name, status, summary, details)
    except Exception as exc:
        return _check(check_name, "critical", f"ElephantAlpha smoke 失敗:{exc}")
def collect_ai_automation_smoke(*, record_history: bool = True, history_limit: int = 20) -> Dict[str, Any]:
    """Run every smoke check and assemble the overall result.

    Args:
        record_history: When true, append a compact record of this run to the
            history file before the history summary is built.
        history_limit: Number of trailing history lines included in the
            returned "history" summary.

    Returns:
        A dict with "status" (the highest-ranked check status per
        ``STATUS_RANK``; unknown statuses rank as critical), "version",
        "generated_at" (local time, ISO format to the second), "checks",
        "summary" (counts per status plus "total") and "history". A
        "history_error" key (truncated to 300 characters) is added when
        writing or reading the history file fails.
    """
    checks: List[Dict[str, Any]] = [
        _event_router_check(),
        _gemini_egress_check(),
        _autoheal_check(),
        _nemotron_check(),
        _embedding_queue_check(),
        _elephant_hitl_check(),
    ]
    worst = max(checks, key=lambda item: STATUS_RANK.get(item["status"], 2))["status"]
    result = {
        "status": worst,
        "version": SYSTEM_VERSION,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "checks": checks,
        "summary": {
            "ok": sum(1 for item in checks if item["status"] == "ok"),
            "warning": sum(1 for item in checks if item["status"] == "warning"),
            "critical": sum(1 for item in checks if item["status"] == "critical"),
            "total": len(checks),
        },
    }
    if record_history:
        try:
            _append_history(result)
        except Exception as exc:
            result["history_error"] = str(exc)[:300]
    # Reading the history is best-effort, like writing it: an unreadable file
    # must not discard the check results that were just collected.
    try:
        history_records = _load_history(limit=history_limit)
    except Exception as exc:
        history_records = []
        # Keep an earlier append error if there was one.
        result.setdefault("history_error", str(exc)[:300])
    result["history"] = _history_summary(history_records)
    return result