From d6f37853c55ee1f9124daefc21fbb96378f34f39 Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Tue, 31 Mar 2026 12:14:54 +0800
Subject: [PATCH] =?UTF-8?q?feat(api):=20Phase=2018.4=20OpenClaw=20?=
 =?UTF-8?q?=E6=B7=B1=E5=BA=A6=E5=88=86=E6=9E=90=E6=95=B4=E5=90=88?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

2026-03-31 Claude Code (統帥批准)

新增功能:
- _llm_analyze() 整合 OpenClawService
  - 使用 analyze_alert() 進行 AI RCA 分析
  - 整合 SigNoz 監控數據
  - 支援 Token/Cost 追蹤

- _map_severity_to_risk(): 嚴重度→風險等級映射
  - critical/高 → CRITICAL
  - warning/medium/中 → MEDIUM
  - 其他 → LOW

- _extract_repair_action(): 從 AI 建議提取可執行動作
  - restart/重啟 → restart_deployment/restart_pod
  - clear/清理/cache → clear_cache
  - scale/擴展 → scale_up (需人工授權)

閉環強化:
規則引擎初步分類 → OpenClaw AI 深度分析 → 更精準的修復建議

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 apps/api/src/services/failure_watcher.py | 133 ++++++++++++++++-------
 1 file changed, 91 insertions(+), 42 deletions(-)

diff --git a/apps/api/src/services/failure_watcher.py b/apps/api/src/services/failure_watcher.py
index 477c7b0a..ad8786a8 100644
--- a/apps/api/src/services/failure_watcher.py
+++ b/apps/api/src/services/failure_watcher.py
@@ -495,65 +495,114 @@ class FailureWatcherService(IFailureWatcher):
         target_resource: str,
         initial_classification: str,
     ) -> dict | None:
-        """LLM 深度分析失敗原因"""
+        """
+        LLM 深度分析失敗原因
+
+        Phase 18.4: OpenClaw 整合
+        2026-03-31 Claude Code (統帥批准)
+
+        使用 OpenClawService 進行 AI 分析，
+        整合 SignOz 監控數據提供更精準的 RCA。
+        """
         try:
-            # 建構 prompt
-            prompt = f"""你是 AIOps 故障分析專家。請分析以下執行失敗:
+            from src.services.openclaw import get_openclaw_service
 
-錯誤訊息: {error_message}
-操作類型: {operation_type}
-目標資源: {target_resource}
-初步分類: {initial_classification}
+            openclaw = get_openclaw_service()
 
-請以 JSON 格式回應:
-{{
-    "classification": "TIMEOUT|K8S_ERROR|NETWORK_ERROR|PERMISSION_DENIED|RESOURCE_ERROR|UNKNOWN",
-    "root_cause": "根本原因分析 (30字內)",
-    "suggested_repair": "建議修復策略 (30字內)",
-    "risk_level": "LOW|MEDIUM|CRITICAL",
-    "confidence": 0.0-1.0
-}}
+            # 建構告警上下文
+            alert_context = {
+                "alert_type": "execution_failure",
+                "severity": "warning",
+                "error_message": error_message,
+                "operation_type": operation_type,
+                "target_resource": target_resource,
+                "initial_classification": initial_classification,
+                "source": "failure_watcher",
+            }
 
-只輸出 JSON，不要其他文字。"""
-
-            # 呼叫 LLM
-            result = await self._model_registry.chat(
-                messages=[{"role": "user", "content": prompt}],
-                task_type="failure_analysis",
+            # 呼叫 OpenClaw 分析 (含 SignOz 整合)
+            analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = (
+                await openclaw.analyze_alert(alert_context)
             )
 
-            if not result or not result.get("success"):
-                logger.warning(
-                    "llm_failure_analysis_failed",
-                    error=result.get("error") if result else "No response",
-                )
-                return None
-
-            # 解析 JSON
-            content = result.get("content", "")
-            # 嘗試提取 JSON
-            import re
-
-            json_match = re.search(r"\{[^}]+\}", content, re.DOTALL)
-            if json_match:
-                analysis = json.loads(json_match.group())
+            if analysis_result:
+                # 從 OpenClaw 結果建構修復分析
                 logger.info(
-                    "llm_failure_analysis_success",
-                    classification=analysis.get("classification"),
-                    risk_level=analysis.get("risk_level"),
-                    confidence=analysis.get("confidence"),
+                    "openclaw_failure_analysis_success",
+                    ai_provider=ai_provider,
+                    severity=analysis_result.severity,
+                    confidence=analysis_result.confidence,
+                    tokens=tokens,
+                    cost_usd=cost,
                 )
-                return analysis
 
+                # 映射 OpenClaw 結果到修復分析格式
+                risk_level = self._map_severity_to_risk(analysis_result.severity)
+
+                return {
+                    "classification": initial_classification,  # 保留規則引擎分類
+                    "root_cause": analysis_result.root_cause_analysis[:100],
+                    "suggested_repair": self._extract_repair_action(
+                        analysis_result.recommended_action
+                    ),
+                    "risk_level": risk_level,
+                    "confidence": analysis_result.confidence,
+                    "ai_provider": ai_provider,
+                    "signoz_trace_url": trace_url,
+                }
+
+            logger.warning(
+                "openclaw_failure_analysis_no_result",
+                raw_response=raw_response[:200] if raw_response else None,
+            )
             return None
 
         except Exception as e:
             logger.warning(
-                "llm_failure_analysis_error",
+                "openclaw_failure_analysis_error",
                 error=str(e),
             )
             return None
 
+    def _map_severity_to_risk(self, severity: str) -> str:
+        """
+        Map an OpenClaw severity label to a repair risk level (Phase 18.4).
+
+        "critical"/"high"/"高" -> CRITICAL, "warning"/"medium"/"中" -> MEDIUM, else LOW.
+        None and surrounding whitespace are tolerated so a sloppy label cannot raise.
+        """
+        label = (severity or "").strip().lower()
+        if label in ("critical", "high", "高"):
+            return "CRITICAL"
+        if label in ("warning", "medium", "中"):
+            return "MEDIUM"
+        return "LOW"
+
+    def _extract_repair_action(self, recommended_action: str) -> str:
+        """
+        Reduce a free-text OpenClaw recommendation to a repair action name (Phase 18.4).
+
+        Matching is by case-insensitive substring, checked in order: restart, clear, scale.
+        Text with no known keyword is returned as-is, truncated to 50 characters.
+        """
+        text = recommended_action.lower()
+
+        def mentions(*keywords: str) -> bool:
+            return any(keyword in text for keyword in keywords)
+
+        if mentions("restart", "重啟", "重新啟動"):
+            # Restart the pod unless the text names a deployment explicitly.
+            return "restart_deployment" if mentions("deployment", "部署") else "restart_pod"
+
+        if mentions("clear", "清理", "cache", "快取"):
+            return "clear_cache"
+
+        if mentions("scale", "擴展", "增加"):
+            return "scale_up"  # needs manual authorization
+
+        # Nothing automatable recognised: hand back the original wording.
+        return recommended_action[:50]
+
     async def _update_audit_log_classification(
         self,
         audit_log_id: str,