feat(api): Phase 18.4 OpenClaw 深度分析整合

2026-03-31 Claude Code (統帥批准) 新增功能: - _llm_analyze() 整合 OpenClawService - 使用 analyze_alert() 進行 AI RCA 分析 - 整合 SignOz 監控數據 - 支援 Token/Cost 追蹤 - _map_severity_to_risk(): 嚴重度→風險等級映射 - critical/高 → CRITICAL - warning/medium/中 → MEDIUM - 其他 → LOW - _extract_repair_action(): 從 AI 建議提取可執行動作 - restart/重啟 → restart_deployment/restart_pod - clear/清理/cache → clear_cache - scale/擴展 → scale_up (需人工授權) 閉環強化: 規則引擎初步分類 → OpenClaw AI 深度分析 → 更精準的修復建議 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-31 12:14:54 +08:00
parent 9a2b1d6653
commit d6f37853c5
1 changed files with 91 additions and 42 deletions
--- a/apps/api/src/services/failure_watcher.py
+++ b/apps/api/src/services/failure_watcher.py
@@ -495,65 +495,114 @@ class FailureWatcherService(IFailureWatcher):
        target_resource: str,
        initial_classification: str,
    ) -> dict | None:
-        """LLM 深度分析失敗原因"""
+        """
+        LLM 深度分析失敗原因
+
+        Phase 18.4: OpenClaw 整合
+        2026-03-31 Claude Code (統帥批准)
+
+        使用 OpenClawService 進行 AI 分析，
+        整合 SignOz 監控數據提供更精準的 RCA。
+        """
        try:
-            # 建構 prompt
-            prompt = f"""你是 AIOps 故障分析專家。請分析以下執行失敗:
+            from src.services.openclaw import get_openclaw_service

-錯誤訊息: {error_message}
-操作類型: {operation_type}
-目標資源: {target_resource}
-初步分類: {initial_classification}
+            openclaw = get_openclaw_service()

-請以 JSON 格式回應:
-{{
-    "classification": "TIMEOUT|K8S_ERROR|NETWORK_ERROR|PERMISSION_DENIED|RESOURCE_ERROR|UNKNOWN",
-    "root_cause": "根本原因分析 (30字內)",
-    "suggested_repair": "建議修復策略 (30字內)",
-    "risk_level": "LOW|MEDIUM|CRITICAL",
-    "confidence": 0.0-1.0
-}}
+            # 建構告警上下文
+            alert_context = {
+                "alert_type": "execution_failure",
+                "severity": "warning",
+                "error_message": error_message,
+                "operation_type": operation_type,
+                "target_resource": target_resource,
+                "initial_classification": initial_classification,
+                "source": "failure_watcher",
+            }

-只輸出 JSON，不要其他文字。"""
-
-            # 呼叫 LLM
-            result = await self._model_registry.chat(
-                messages=[{"role": "user", "content": prompt}],
-                task_type="failure_analysis",
+            # 呼叫 OpenClaw 分析 (含 SignOz 整合)
+            analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = (
+                await openclaw.analyze_alert(alert_context)
            )

-            if not result or not result.get("success"):
-                logger.warning(
-                    "llm_failure_analysis_failed",
-                    error=result.get("error") if result else "No response",
-                )
-                return None
-
-            # 解析 JSON
-            content = result.get("content", "")
-            # 嘗試提取 JSON
-            import re
-
-            json_match = re.search(r"\{[^}]+\}", content, re.DOTALL)
-            if json_match:
-                analysis = json.loads(json_match.group())
+            if analysis_result:
+                # 從 OpenClaw 結果建構修復分析
                logger.info(
-                    "llm_failure_analysis_success",
-                    classification=analysis.get("classification"),
-                    risk_level=analysis.get("risk_level"),
-                    confidence=analysis.get("confidence"),
+                    "openclaw_failure_analysis_success",
+                    ai_provider=ai_provider,
+                    severity=analysis_result.severity,
+                    confidence=analysis_result.confidence,
+                    tokens=tokens,
+                    cost_usd=cost,
                )
-                return analysis

+                # 映射 OpenClaw 結果到修復分析格式
+                risk_level = self._map_severity_to_risk(analysis_result.severity)
+
+                return {
+                    "classification": initial_classification,  # 保留規則引擎分類
+                    "root_cause": analysis_result.root_cause_analysis[:100],
+                    "suggested_repair": self._extract_repair_action(
+                        analysis_result.recommended_action
+                    ),
+                    "risk_level": risk_level,
+                    "confidence": analysis_result.confidence,
+                    "ai_provider": ai_provider,
+                    "signoz_trace_url": trace_url,
+                }
+
+            logger.warning(
+                "openclaw_failure_analysis_no_result",
+                raw_response=raw_response[:200] if raw_response else None,
+            )
            return None

        except Exception as e:
            logger.warning(
-                "llm_failure_analysis_error",
+                "openclaw_failure_analysis_error",
                error=str(e),
            )
            return None

+    def _map_severity_to_risk(self, severity: str) -> str:
+        """
+        Map an OpenClaw severity label to a repair risk level.
+
+        Phase 18.4: severity mapping.
+
+        Matching ignores letter case and surrounding whitespace.
+
+        Args:
+            severity: Severity label, in English or Chinese
+                (e.g. "critical", "warning", "高", "中").
+
+        Returns:
+            "CRITICAL" for critical/high/高, "MEDIUM" for warning/medium/中,
+            and "LOW" for every other label.
+        """
+        severity_lower = severity.strip().lower()
+        # "high" is the English spelling of "高"; both must map to the same
+        # level, otherwise an English "high" silently falls through to LOW.
+        if severity_lower in ("critical", "high", "高"):
+            return "CRITICAL"
+        if severity_lower in ("warning", "medium", "中"):
+            return "MEDIUM"
+        return "LOW"
+
+    def _extract_repair_action(self, recommended_action: str) -> str:
+        """
+        Extract an executable repair action from an OpenClaw recommendation.
+
+        Phase 18.4: action extraction.
+
+        Keyword groups are checked in priority order (restart, cache
+        clearing, scaling) using case-insensitive substring matching.
+
+        Args:
+            recommended_action: Free-text recommendation. ``None`` or an
+                empty string is treated as "no recommendation".
+
+        Returns:
+            One of "restart_deployment", "restart_pod", "clear_cache" or
+            "scale_up" when a keyword matches; otherwise the first 50
+            characters of the original recommendation ("" when there is
+            none).
+        """
+        # A missing recommendation must not raise: the caller would then
+        # discard the whole analysis instead of just this field.
+        if not recommended_action:
+            return ""
+
+        action_lower = recommended_action.lower()
+
+        # Actions that can be executed automatically.
+        if any(kw in action_lower for kw in ("restart", "重啟", "重新啟動")):
+            if "deployment" in action_lower or "部署" in action_lower:
+                return "restart_deployment"
+            # Restarting the pod is the default when no deployment is named.
+            return "restart_pod"
+
+        if any(kw in action_lower for kw in ("clear", "清理", "cache", "快取")):
+            return "clear_cache"
+
+        wants_scaling = any(kw in action_lower for kw in ("scale", "擴展", "增加"))
+        # A scale-down recommendation must never be turned into scale_up.
+        wants_scale_down = any(
+            kw in action_lower
+            for kw in ("scale down", "scale-down", "downscale", "縮減", "縮容")
+        )
+        if wants_scaling and not wants_scale_down:
+            return "scale_up"  # requires manual authorization
+
+        # Not automatically executable: return the original text, truncated.
+        return recommended_action[:50]
+
    async def _update_audit_log_classification(
        self,
        audit_log_id: str,