# ewoooc/services/ai_automation_smoke_service.py

"""Smoke checks for the four-agent AI automation control plane.

The checks are read-only and intentionally avoid outbound network calls. They
are meant for a fast dashboard/API sanity check, not for deep production probes.
"""

from __future__ import annotations

import os
import json
import threading
from datetime import datetime, timedelta, timezone
from html import escape
from typing import Any, Dict, List

from sqlalchemy import text

from config import SYSTEM_VERSION
from database.manager import get_session


# Severity ordering used to pick the worst status across all checks
# (a higher value means more severe).
STATUS_RANK = {"ok": 0, "warning": 1, "critical": 2}
# Location of the JSONL history file. Overridable through the
# MOMO_AI_AUTOMATION_SMOKE_HISTORY environment variable; the default is
# <parent of this file's directory>/data/ai_automation_smoke_history.jsonl.
_HISTORY_PATH = os.getenv(
    "MOMO_AI_AUTOMATION_SMOKE_HISTORY",
    os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "ai_automation_smoke_history.jsonl"),
)
# Number of history lines kept on disk (default 200). The value is parsed at
# import time, so a non-integer environment value raises ValueError on import.
_HISTORY_LIMIT = int(os.getenv("MOMO_AI_AUTOMATION_SMOKE_HISTORY_LIMIT", "200"))
# In-process lock taken around reads, rewrites and deletion of the history file.
_HISTORY_LOCK = threading.Lock()


def _check(name: str, status: str, summary: str, details: Dict[str, Any] | None = None) -> Dict[str, Any]:
    """Build the uniform result dict shared by every smoke check.

    Args:
        name: Display name of the check.
        status: One of the status labels ("ok", "warning", "critical").
        summary: One-line human-readable outcome.
        details: Optional extra data; a falsy value (``None`` or an empty
            mapping) is replaced by a new empty dict.

    Returns:
        A dict with the keys ``name``, ``status``, ``summary`` and ``details``.
    """
    payload: Dict[str, Any] = dict(name=name, status=status, summary=summary)
    payload["details"] = details if details else {}
    return payload


def _count_jsonl_lines(path: str) -> int:
    """Return the number of non-blank lines in ``path``.

    A missing file counts as zero lines; any other I/O error propagates.
    """
    total = 0
    try:
        with open(path, "r", encoding="utf-8") as handle:
            for raw in handle:
                if raw.strip():
                    total += 1
    except FileNotFoundError:
        return 0
    return total


def _compact_history_record(result: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a full smoke result to the slim record stored in history.

    Only the timestamp, overall status, summary counts and each check's
    name/status/summary are kept; per-check ``details`` are dropped. A missing
    ``status`` defaults to ``"critical"``.
    """
    slim_checks = []
    for entry in result.get("checks", []):
        slim_checks.append({field: entry.get(field) for field in ("name", "status", "summary")})

    compact: Dict[str, Any] = {"generated_at": result.get("generated_at")}
    compact["status"] = result.get("status", "critical")
    compact["summary"] = result.get("summary", {})
    compact["checks"] = slim_checks
    return compact


def _read_history_lines() -> List[str]:
    """Return every raw line of the history file, read while holding the history lock.

    Raises:
        FileNotFoundError: If the history file does not exist.
    """
    with _HISTORY_LOCK, open(_HISTORY_PATH, "r", encoding="utf-8") as source:
        raw_lines = source.readlines()
    return raw_lines


def _load_history(limit: int = 20) -> List[Dict[str, Any]]:
    """Load the most recent history records.

    Args:
        limit: Number of trailing history lines to inspect. A non-positive
            value yields an empty list.

    Returns:
        Parsed records in file order (oldest first). Lines that are not valid
        JSON, or whose JSON value is not an object, are skipped, so fewer than
        ``limit`` records may be returned. A missing history file yields ``[]``.
    """
    if limit <= 0:
        return []
    try:
        lines = _read_history_lines()
    except FileNotFoundError:
        return []

    records: List[Dict[str, Any]] = []
    for line in lines[-limit:]:
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A hand-edited or corrupted line can still be valid JSON (null, a
        # number, a list). Consumers call .get() on every record, so only JSON
        # objects are kept.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records


def _append_history(result: Dict[str, Any]) -> None:
    """Append a compact record of ``result`` to the history file and trim it.

    The file is read, extended and rewritten as a whole while holding the
    history lock. When ``_HISTORY_LIMIT`` is positive only the newest
    ``_HISTORY_LIMIT`` lines are kept; a non-positive limit disables trimming.

    Args:
        result: Full smoke result; it is reduced with
            ``_compact_history_record`` before being stored.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    record = _compact_history_record(result)
    parent_dir = os.path.dirname(_HISTORY_PATH)
    # os.makedirs("") raises FileNotFoundError, so skip it when the configured
    # path is a bare filename (relative to the working directory).
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with _HISTORY_LOCK:
        existing: List[str] = []
        try:
            with open(_HISTORY_PATH, "r", encoding="utf-8") as fh:
                existing = fh.readlines()
        except FileNotFoundError:
            pass

        # A last line without a trailing newline would otherwise be fused with
        # the new record, making both unparseable.
        if existing and not existing[-1].endswith("\n"):
            existing[-1] += "\n"

        existing.append(json.dumps(record, ensure_ascii=False, default=str) + "\n")
        # Only trim for a positive limit: with a negative limit the slice
        # existing[-limit:] would drop the leading lines instead of keeping a tail.
        if _HISTORY_LIMIT > 0 and len(existing) > _HISTORY_LIMIT:
            existing = existing[-_HISTORY_LIMIT:]

        with open(_HISTORY_PATH, "w", encoding="utf-8") as fh:
            fh.writelines(existing)


def _history_summary(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate history records into counts, the latest record and per-day rows.

    Returns:
        A dict with ``counts`` (per-status totals; statuses other than
        ok/warning/critical are ignored, a missing status counts as critical),
        ``recent`` (the records as given), ``latest`` (last record or ``None``)
        and ``daily`` (output of ``_daily_summary``).
    """
    tally = dict.fromkeys(("ok", "warning", "critical"), 0)
    for entry in records:
        label = entry.get("status", "critical")
        if label not in tally:
            continue
        tally[label] += 1

    newest = records[-1] if records else None
    return {
        "counts": tally,
        "recent": records,
        "latest": newest,
        "daily": _daily_summary(records),
    }


def _daily_summary(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group history records by calendar day and count statuses per day.

    The day is the first 10 characters of ``generated_at`` (``"unknown"`` when
    it is missing or empty). A missing status counts as critical; any status
    other than ok/warning/critical only contributes to ``total``.

    Args:
        records: History records (dicts) in any order.

    Returns:
        At most the last 14 days, sorted by day string, each as
        ``{"date", "ok", "warning", "critical", "total"}``.
    """
    # Validate against the status labels explicitly rather than against the
    # bucket's keys: the bucket also holds "total", which would otherwise be
    # accepted as a status and counted twice. A tuple also tolerates
    # unhashable status values.
    status_keys = ("ok", "warning", "critical")
    by_day: Dict[str, Dict[str, int]] = {}
    for record in records:
        day = str(record.get("generated_at") or "unknown")[:10]
        bucket = by_day.setdefault(day, {"ok": 0, "warning": 0, "critical": 0, "total": 0})
        status = record.get("status", "critical")
        if status in status_keys:
            bucket[status] += 1
        bucket["total"] += 1
    return [
        {"date": day, **counts}
        for day, counts in sorted(by_day.items())[-14:]
    ]


def export_smoke_history_jsonl() -> Dict[str, Any]:
    """Return the raw history file content for export.

    Returns:
        A dict with ``content`` (all lines joined verbatim), ``count`` (number
        of non-blank lines) and ``path`` (history file location). A missing
        file yields empty content and a count of 0.
    """
    try:
        raw = _read_history_lines()
    except FileNotFoundError:
        raw = []
    non_blank = [entry for entry in raw if entry.strip()]
    return {
        "content": "".join(raw),
        "count": len(non_blank),
        "path": _HISTORY_PATH,
    }


def clear_smoke_history() -> Dict[str, Any]:
    """Delete the history file and report how many records were removed.

    Returns:
        ``{"cleared": <non-blank lines removed>, "path": <history path>}``.
        If the file does not exist ``cleared`` is 0. Any other exception is
        reported instead of raised: ``cleared`` is 0 and ``error`` holds the
        exception text truncated to 300 characters.
    """
    try:
        with _HISTORY_LOCK:
            # Count and delete under the same lock so an append from another
            # thread cannot land between the two steps and skew the count.
            cleared = _count_jsonl_lines(_HISTORY_PATH)
            try:
                os.remove(_HISTORY_PATH)
            except FileNotFoundError:
                cleared = 0
        return {"cleared": cleared, "path": _HISTORY_PATH}
    except Exception as exc:
        return {"cleared": 0, "path": _HISTORY_PATH, "error": str(exc)[:300]}


def build_smoke_daily_summary_message(history_limit: int = 200) -> str:
    """Render the HTML-tagged daily digest of recent smoke-check history.

    Args:
        history_limit: How many trailing history records to summarise.

    Returns:
        A newline-joined message containing the system version, the latest
        status and timestamp, overall status counts, up to five of the most
        recent daily rows, and up to five warning/critical checks from the
        latest record. Dynamic text is HTML-escaped.
    """
    digest = _history_summary(_load_history(limit=history_limit))
    per_day = digest.get("daily", [])
    newest = digest.get("latest") or {}
    tally = digest.get("counts") or {"ok": 0, "warning": 0, "critical": 0}
    divider = "━━━━━━━━━━━━━━━━━━━━"

    out = [
        "🤖 <b>AI 自動化 Smoke 每日摘要</b>",
        divider,
        f"版本：<code>{escape(SYSTEM_VERSION)}</code>",
        f"最近狀態：<b>{escape(str(newest.get('status', 'no_data')).upper())}</b>",
        f"最近時間：{escape(str(newest.get('generated_at', '尚無紀錄')))}",
        "",
        "📊 <b>近期統計</b>",
        f"✅ OK：{tally.get('ok', 0)}",
        f"⚠️ Warning：{tally.get('warning', 0)}",
        f"🚨 Critical：{tally.get('critical', 0)}",
    ]

    # Per-day section: the last five daily rows, or a hint when history is empty.
    out.append("")
    if per_day:
        out.append("🗓️ <b>最近每日摘要</b>")
        out.extend(
            f"{escape(str(day.get('date')))}｜"
            f"OK {day.get('ok', 0)} / W {day.get('warning', 0)} / C {day.get('critical', 0)}"
            for day in per_day[-5:]
        )
    else:
        out.append("🗓️ 尚無 smoke history；請先開啟 /ai_automation_smoke 執行快檢。")

    # Problem section: only emitted when the latest record has non-ok checks.
    flagged = []
    for entry in newest.get("checks", []):
        if entry.get("status") in {"warning", "critical"}:
            flagged.append(entry)
    if flagged:
        out.append("")
        out.append("🚩 <b>最近異常檢查</b>")
        for entry in flagged[:5]:
            level = escape(str(entry.get("status", "")).upper())
            title = escape(str(entry.get("name", "unknown")))
            note = escape(str(entry.get("summary", "")))
            out.append(f"{level}｜{title}：{note}")

    out.append(divider)
    out.append("入口：/ai_automation_smoke")
    return "\n".join(out)


def send_smoke_daily_summary(chat_ids: List[Any] | None = None) -> Dict[str, Any]:
    """Build the daily smoke digest and send it through the Telegram helper.

    Args:
        chat_ids: Optional recipients, forwarded unchanged to
            ``send_telegram_with_result``.

    Returns:
        A dict with ``status`` ("sent" when the helper's result has a truthy
        ``ok``, otherwise "failed"), ``telegram`` (the helper's raw result) and
        ``message_preview`` (first 500 characters of the message).
    """
    from services.telegram_templates import send_telegram_with_result

    body = build_smoke_daily_summary_message()
    delivery = send_telegram_with_result(body, chat_ids=chat_ids)
    outcome = "sent" if delivery.get("ok") else "failed"
    return {
        "status": outcome,
        "telegram": delivery,
        "message_preview": body[:500],
    }


def _event_router_check() -> Dict[str, Any]:
    """Check that the event router is importable and whether deliveries are queued.

    Returns:
        ``ok`` when the queue file has no non-blank lines, ``warning`` when
        deliveries are waiting, ``critical`` when anything raises (including
        the imports).
    """
    label = "EventRouter 通知鏈"
    try:
        from services import event_router
        from services.ai_automation_metrics import snapshot

        pending = _count_jsonl_lines(event_router._QUEUE_PATH)
        counters = snapshot().get("counters", {})
        # Counter keys are (metric_name, labels) pairs; add up every label set
        # recorded for the dispatch metric.
        dispatched = 0
        for (metric, _labels), value in counters.items():
            if metric == "event_router_dispatch_total":
                dispatched += value

        if pending:
            status, summary = "warning", "EventRouter 可用，但有待回放通知"
        else:
            status, summary = "ok", "EventRouter 可用，通知 queue 乾淨"

        details = {
            "dispatch_sync": callable(getattr(event_router, "dispatch_sync", None)),
            "notify_failure": callable(getattr(event_router, "notify_failure", None)),
            "queued_deliveries": pending,
            "dispatch_metric_total": dispatched,
        }
        return _check(label, status, summary, details)
    except Exception as exc:
        return _check(label, "critical", f"EventRouter smoke 失敗：{exc}")


def _row_mapping(row: Any) -> Dict[str, Any]:
    """Convert a result row into a plain dict.

    A dict is returned as-is (same object). An object exposing a non-``None``
    ``_mapping`` attribute is copied into a new dict. ``None`` and anything
    else yield an empty dict.
    """
    if isinstance(row, dict):
        return row
    view = None if row is None else getattr(row, "_mapping", None)
    return dict(view) if view is not None else {}


def _gemini_egress_check(window_hours: int = 24) -> Dict[str, Any]:
    """Read-only runtime sentinel for unexpected Gemini spend.

    Queries the ``ai_calls`` table for rows whose provider is ``gemini``
    (case-insensitive) and whose ``called_at`` falls within the last
    ``window_hours`` hours, then classifies the result.

    Args:
        window_hours: Size of the look-back window in hours.

    Returns:
        A check dict with status:
        ``ok`` when no Gemini calls were recorded in the window;
        ``critical`` when calls exist while ``is_gemini_hard_disabled()`` is true;
        ``warning`` when calls exist and Gemini is not hard-disabled, or when
        anything raises (import, DB access, value conversion).
    """
    session = None
    try:
        # Imported inside the try so a missing guard module degrades to a
        # warning result instead of breaking the caller.
        from services.gemini_guard import gemini_disabled_message, is_gemini_hard_disabled

        session = get_session()
        # Window start is computed as a timezone-aware UTC timestamp.
        # NOTE(review): assumes ai_calls.called_at is comparable with an aware
        # UTC datetime — confirm against the schema.
        since_at = datetime.now(timezone.utc) - timedelta(hours=int(window_hours))
        # Aggregate totals for the whole window (single row).
        summary_row = session.execute(
            text("""
                SELECT
                    COUNT(*) AS calls,
                    COALESCE(SUM(COALESCE(input_tokens, 0) + COALESCE(output_tokens, 0)), 0) AS tokens,
                    COALESCE(SUM(COALESCE(cost_usd, 0)), 0) AS cost_usd,
                    MAX(called_at) AS last_called
                FROM ai_calls
                WHERE lower(provider) = 'gemini'
                  AND called_at >= :since_at
            """),
            {"since_at": since_at},
        ).fetchone()
        summary = _row_mapping(summary_row)
        # "or 0" guards against NULL/None aggregate values.
        calls = int(summary.get("calls") or 0)
        tokens = int(summary.get("tokens") or 0)
        cost_usd = float(summary.get("cost_usd") or 0.0)
        hard_disabled = is_gemini_hard_disabled()

        # The per-caller breakdown is only queried when there was any traffic.
        top_rows = []
        if calls:
            top_rows = [
                _row_mapping(row) for row in session.execute(
                    text("""
                        SELECT
                            caller,
                            model,
                            COUNT(*) AS calls,
                            COALESCE(SUM(COALESCE(input_tokens, 0) + COALESCE(output_tokens, 0)), 0) AS tokens,
                            COALESCE(SUM(COALESCE(cost_usd, 0)), 0) AS cost_usd,
                            MAX(called_at) AS last_called
                        FROM ai_calls
                        WHERE lower(provider) = 'gemini'
                          AND called_at >= :since_at
                        GROUP BY caller, model
                        ORDER BY calls DESC, last_called DESC
                        LIMIT 5
                    """),
                    {"since_at": since_at},
                ).fetchall()
            ]

        # Details attached to every non-error result.
        details = {
            "window_hours": int(window_hours),
            "since_at": since_at.isoformat(timespec="seconds"),
            "hard_disabled": hard_disabled,
            "calls": calls,
            "tokens": tokens,
            "cost_usd": round(cost_usd, 6),
            "last_called": str(summary.get("last_called") or ""),
            "top_callers": top_rows,
            "guard_reason": gemini_disabled_message("ai_smoke_gemini_egress"),
        }

        # No traffic at all: nothing to report.
        if calls == 0:
            return _check(
                "Gemini 出站費用 sentinel",
                "ok",
                f"最近 {window_hours}h Gemini 出站 0 次，費用 $0",
                details,
            )
        # Traffic while the guard reports hard-disabled is the critical case.
        if hard_disabled:
            return _check(
                "Gemini 出站費用 sentinel",
                "critical",
                f"Hard-disabled 狀態仍偵測到 Gemini {calls} 次 / ${cost_usd:.6f}",
                details,
            )
        # Traffic while not hard-disabled is reported as a warning.
        return _check(
            "Gemini 出站費用 sentinel",
            "warning",
            f"Gemini emergency fallback 近 {window_hours}h 使用 {calls} 次 / ${cost_usd:.6f}",
            details,
        )
    except Exception as exc:
        # Any failure is downgraded to a warning without details.
        return _check(
            "Gemini 出站費用 sentinel",
            "warning",
            f"Gemini 出站紀錄無法讀取：{exc}",
        )
    finally:
        # Always release the session, including on the early-return paths.
        if session is not None:
            session.close()


def _autoheal_check() -> Dict[str, Any]:
    """Verify that AutoHeal lists the database containers as protected.

    Returns:
        ``ok`` when both ``momo-db`` and ``momo-postgres`` are in the module's
        ``_PROTECTED_CONTAINERS``; ``critical`` when either is missing or when
        anything raises (including the import).
    """
    label = "AutoHeal 安全邊界"
    try:
        import services.auto_heal_service as autoheal

        guarded = set(getattr(autoheal, "_PROTECTED_CONTAINERS", set()))
        unprotected = sorted({"momo-db", "momo-postgres"} - guarded)
        actions = sorted(getattr(autoheal, "_ALLOWED_ACTION_TYPES", set()))

        if unprotected:
            status = "critical"
            summary = "AutoHeal protected resource 缺漏"
        else:
            status = "ok"
            summary = "AutoHeal 保護資料庫容器，安全邊界存在"

        details = {
            "protected_containers": sorted(guarded),
            "missing_required_protection": unprotected,
            "allowed_actions": actions,
        }
        return _check(label, status, summary, details)
    except Exception as exc:
        return _check(label, "critical", f"AutoHeal smoke 失敗：{exc}")


def _nemotron_check() -> Dict[str, Any]:
    """Inspect the NemoTron dispatcher module for its fallback and quota state.

    Returns:
        ``critical`` when no dispatcher class with ``_hermes_rule_fallback`` is
        found or when anything raises; ``warning`` when ``NIM_API_KEY`` is
        empty or the recorded call count has reached ``NIM_DAILY_LIMIT``;
        ``ok`` otherwise.
    """
    label = "NemoTron fallback"
    try:
        import services.nemoton_dispatcher_service as nemotron

        # Accept either class name; the second is looked up only if the first is absent.
        dispatcher_cls = getattr(nemotron, "NemotronDispatcherService", None)
        if not dispatcher_cls:
            dispatcher_cls = getattr(nemotron, "NemotronDispatcher", None)

        fallback_ready = bool(dispatcher_cls and hasattr(dispatcher_cls, "_hermes_rule_fallback"))
        api_key_configured = bool(getattr(nemotron, "NIM_API_KEY", ""))
        call_count = getattr(nemotron, "_nim_call_count", {}).get("count", 0)
        daily_limit = getattr(nemotron, "NIM_DAILY_LIMIT", 80)

        if not fallback_ready:
            status, summary = "critical", "NemoTron Hermes fallback 缺失"
        elif not api_key_configured:
            status, summary = "warning", "NemoTron API key 未設定，目前會走 Hermes fallback"
        elif call_count >= daily_limit:
            status, summary = "warning", "NemoTron 配額已達上限，會走 Hermes fallback"
        else:
            status, summary = "ok", "NemoTron 與 Hermes fallback 機制可用"

        details = {
            "fallback_ready": fallback_ready,
            "dispatcher_class": getattr(dispatcher_cls, "__name__", None),
            "api_key_configured": api_key_configured,
            "call_count": call_count,
            "daily_limit": daily_limit,
        }
        return _check(label, status, summary, details)
    except Exception as exc:
        return _check(label, "critical", f"NemoTron smoke 失敗：{exc}")


def _embedding_queue_check() -> Dict[str, Any]:
    """Read per-status row counts from ``embedding_retry_queue`` and flag backlog.

    Returns:
        ``warning`` when more than 1000 rows are pending or more than 200 are
        processing, or when the query fails for any reason; ``ok`` otherwise.
        The session is always closed.
    """
    label = "OpenClaw embedding queue"
    session = None
    try:
        session = get_session()
        query = text("SELECT status, COUNT(*) AS count FROM embedding_retry_queue GROUP BY status")
        counts = {}
        for row in session.execute(query).fetchall():
            counts[str(row._mapping["status"])] = int(row._mapping["count"])

        pending = counts.get("pending", 0)
        processing = counts.get("processing", 0)
        backlog_high = pending > 1000 or processing > 200
        if backlog_high:
            status, summary = "warning", "OpenClaw embedding queue backlog 偏高"
        else:
            status, summary = "ok", "OpenClaw embedding queue 可讀取且 backlog 正常"

        details = {"counts": counts, "pending": pending, "processing": processing}
        return _check(label, status, summary, details)
    except Exception as exc:
        return _check(
            label,
            "warning",
            f"Embedding queue 無法讀取，可能是 DB 離線或 migration 未套用：{exc}",
        )
    finally:
        if session is not None:
            session.close()


def _elephant_hitl_check() -> Dict[str, Any]:
    """Check that the ElephantAlpha engine exposes its escalation and timeout hooks.

    Returns:
        ``critical`` when ``_escalate_to_human`` or ``_run_with_timeout`` is
        missing on the engine class, or when anything raises; ``warning`` when
        neither ``OPENROUTER_API_KEY`` nor ``NVIDIA_API_KEY`` is set in the
        environment; ``ok`` otherwise.
    """
    label = "ElephantAlpha AI 例外決策"
    try:
        from services.elephant_alpha_autonomous_engine import ElephantAlphaAutonomousEngine

        has_hitl = hasattr(ElephantAlphaAutonomousEngine, "_escalate_to_human")
        has_timeout_guard = hasattr(ElephantAlphaAutonomousEngine, "_run_with_timeout")
        api_key_configured = bool(os.getenv("OPENROUTER_API_KEY") or os.getenv("NVIDIA_API_KEY"))

        if not (has_hitl and has_timeout_guard):
            status, summary = "critical", "ElephantAlpha AI 例外決策或 timeout guard 缺失"
        elif api_key_configured:
            status, summary = "ok", "ElephantAlpha AI 例外決策與 timeout guard 可用"
        else:
            status, summary = "warning", "ElephantAlpha AI 例外決策程式可用，但 API key 未設定"

        details = {
            "hitl_method": has_hitl,
            "timeout_guard": has_timeout_guard,
            "api_key_configured": api_key_configured,
        }
        return _check(label, status, summary, details)
    except Exception as exc:
        return _check(label, "critical", f"ElephantAlpha smoke 失敗：{exc}")


def collect_ai_automation_smoke(*, record_history: bool = True, history_limit: int = 20) -> Dict[str, Any]:
    """Run every smoke check and assemble the combined report.

    Args:
        record_history: When true, append a compact record of this run to the
            history file; a failure to do so is reported under
            ``history_error`` (truncated to 300 characters) instead of raised.
        history_limit: Number of recent history records to include in the
            ``history`` section of the result.

    Returns:
        A dict with the worst ``status`` across checks, ``version``,
        ``generated_at`` (local time, second precision), the individual
        ``checks``, per-status ``summary`` counts and a ``history`` summary.
    """
    probes = (
        _event_router_check,
        _gemini_egress_check,
        _autoheal_check,
        _nemotron_check,
        _embedding_queue_check,
        _elephant_hitl_check,
    )
    checks: List[Dict[str, Any]] = [probe() for probe in probes]

    def severity(item: Dict[str, Any]) -> int:
        # Unrecognised statuses rank as critical.
        return STATUS_RANK.get(item["status"], 2)

    worst = max(checks, key=severity)["status"]
    statuses = [item["status"] for item in checks]
    result = {
        "status": worst,
        "version": SYSTEM_VERSION,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "checks": checks,
        "summary": {
            "ok": statuses.count("ok"),
            "warning": statuses.count("warning"),
            "critical": statuses.count("critical"),
            "total": len(checks),
        },
    }

    if record_history:
        try:
            _append_history(result)
        except Exception as exc:
            result["history_error"] = str(exc)[:300]

    recent = _load_history(limit=history_limit)
    result["history"] = _history_summary(recent)
    return result