From d6f37853c55ee1f9124daefc21fbb96378f34f39 Mon Sep 17 00:00:00 2001 From: OG T Date: Tue, 31 Mar 2026 12:14:54 +0800 Subject: [PATCH] =?UTF-8?q?feat(api):=20Phase=2018.4=20OpenClaw=20?= =?UTF-8?q?=E6=B7=B1=E5=BA=A6=E5=88=86=E6=9E=90=E6=95=B4=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2026-03-31 Claude Code (統帥批准) 新增功能: - _llm_analyze() 整合 OpenClawService - 使用 analyze_alert() 進行 AI RCA 分析 - 整合 SignOz 監控數據 - 支援 Token/Cost 追蹤 - _map_severity_to_risk(): 嚴重度→風險等級映射 - critical/高 → CRITICAL - warning/medium/中 → MEDIUM - 其他 → LOW - _extract_repair_action(): 從 AI 建議提取可執行動作 - restart/重啟 → restart_deployment/restart_pod - clear/清理/cache → clear_cache - scale/擴展 → scale_up (需人工授權) 閉環強化: 規則引擎初步分類 → OpenClaw AI 深度分析 → 更精準的修復建議 Co-Authored-By: Claude Opus 4.5 --- apps/api/src/services/failure_watcher.py | 133 ++++++++++++++++------- 1 file changed, 91 insertions(+), 42 deletions(-) diff --git a/apps/api/src/services/failure_watcher.py b/apps/api/src/services/failure_watcher.py index 477c7b0a..ad8786a8 100644 --- a/apps/api/src/services/failure_watcher.py +++ b/apps/api/src/services/failure_watcher.py @@ -495,65 +495,114 @@ class FailureWatcherService(IFailureWatcher): target_resource: str, initial_classification: str, ) -> dict | None: - """LLM 深度分析失敗原因""" + """ + LLM 深度分析失敗原因 + + Phase 18.4: OpenClaw 整合 + 2026-03-31 Claude Code (統帥批准) + + 使用 OpenClawService 進行 AI 分析, + 整合 SignOz 監控數據提供更精準的 RCA。 + """ try: - # 建構 prompt - prompt = f"""你是 AIOps 故障分析專家。請分析以下執行失敗: + from src.services.openclaw import get_openclaw_service -錯誤訊息: {error_message} -操作類型: {operation_type} -目標資源: {target_resource} -初步分類: {initial_classification} + openclaw = get_openclaw_service() -請以 JSON 格式回應: -{{ - "classification": "TIMEOUT|K8S_ERROR|NETWORK_ERROR|PERMISSION_DENIED|RESOURCE_ERROR|UNKNOWN", - "root_cause": "根本原因分析 (30字內)", - "suggested_repair": "建議修復策略 (30字內)", - "risk_level": "LOW|MEDIUM|CRITICAL", - "confidence": 0.0-1.0 -}} + # 建構告警上下文 + alert_context = { + "alert_type": "execution_failure", + "severity": "warning", + "error_message": error_message, + "operation_type": operation_type, + "target_resource": target_resource, + "initial_classification": initial_classification, + "source": "failure_watcher", + } -只輸出 JSON,不要其他文字。""" - - # 呼叫 LLM - result = await self._model_registry.chat( - messages=[{"role": "user", "content": prompt}], - task_type="failure_analysis", + # 呼叫 OpenClaw 分析 (含 SignOz 整合) + analysis_result, ai_provider, raw_response, signoz_metrics, trace_url, tokens, cost = ( + await openclaw.analyze_alert(alert_context) ) - if not result or not result.get("success"): - logger.warning( - "llm_failure_analysis_failed", - error=result.get("error") if result else "No response", - ) - return None - - # 解析 JSON - content = result.get("content", "") - # 嘗試提取 JSON - import re - - json_match = re.search(r"\{[^}]+\}", content, re.DOTALL) - if json_match: - analysis = json.loads(json_match.group()) + if analysis_result: + # 從 OpenClaw 結果建構修復分析 logger.info( - "llm_failure_analysis_success", - classification=analysis.get("classification"), - risk_level=analysis.get("risk_level"), - confidence=analysis.get("confidence"), + "openclaw_failure_analysis_success", + ai_provider=ai_provider, + severity=analysis_result.severity, + confidence=analysis_result.confidence, + tokens=tokens, + cost_usd=cost, ) - return analysis + # 映射 OpenClaw 結果到修復分析格式 + risk_level = self._map_severity_to_risk(analysis_result.severity) + + return { + "classification": initial_classification, # 保留規則引擎分類 + "root_cause": analysis_result.root_cause_analysis[:100], + "suggested_repair": self._extract_repair_action( + analysis_result.recommended_action + ), + "risk_level": risk_level, + "confidence": analysis_result.confidence, + "ai_provider": ai_provider, + "signoz_trace_url": trace_url, + } + + logger.warning( + "openclaw_failure_analysis_no_result", + raw_response=raw_response[:200] if raw_response else None, + ) return None except Exception as e: logger.warning( - "llm_failure_analysis_error", + "openclaw_failure_analysis_error", error=str(e), ) return None + def _map_severity_to_risk(self, severity: str) -> str: + """ + 將 OpenClaw severity 映射到修復風險等級 + + Phase 18.4: 嚴重度映射 + """ + severity_lower = severity.lower() + if severity_lower in ["critical", "高"]: + return "CRITICAL" + elif severity_lower in ["warning", "medium", "中"]: + return "MEDIUM" + else: + return "LOW" + + def _extract_repair_action(self, recommended_action: str) -> str: + """ + 從 OpenClaw 建議中提取可執行的修復動作 + + Phase 18.4: 動作提取 + """ + action_lower = recommended_action.lower() + + # 識別可自動執行的動作 + if any(kw in action_lower for kw in ["restart", "重啟", "重新啟動"]): + if "deployment" in action_lower or "部署" in action_lower: + return "restart_deployment" + elif "pod" in action_lower: + return "restart_pod" + return "restart_pod" # 預設重啟 Pod + + if any(kw in action_lower for kw in ["clear", "清理", "cache", "快取"]): + return "clear_cache" + + if any(kw in action_lower for kw in ["scale", "擴展", "增加"]): + return "scale_up" # 需人工授權 + + # 無法自動執行,返回原始建議 + return recommended_action[:50] + async def _update_audit_log_classification( self, audit_log_id: str,